Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockTriDiContainer_impl.hpp
1/*@HEADER
2// ***********************************************************************
3//
4// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ***********************************************************************
40//@HEADER
41*/
42
43#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
45
46#include <Teuchos_Details_MpiTypeTraits.hpp>
47
48#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
49#include <Tpetra_Distributor.hpp>
50#include <Tpetra_BlockMultiVector.hpp>
51
52#include <Kokkos_ArithTraits.hpp>
53#include <KokkosBatched_Util.hpp>
54#include <KokkosBatched_Vector.hpp>
55#include <KokkosBatched_Copy_Decl.hpp>
56#include <KokkosBatched_Copy_Impl.hpp>
57#include <KokkosBatched_AddRadial_Decl.hpp>
58#include <KokkosBatched_AddRadial_Impl.hpp>
59#include <KokkosBatched_SetIdentity_Decl.hpp>
60#include <KokkosBatched_SetIdentity_Impl.hpp>
61#include <KokkosBatched_Gemm_Decl.hpp>
62#include <KokkosBatched_Gemm_Serial_Impl.hpp>
63#include <KokkosBatched_Gemm_Team_Impl.hpp>
64#include <KokkosBatched_Gemv_Decl.hpp>
65#include <KokkosBatched_Gemv_Team_Impl.hpp>
66#include <KokkosBatched_Trsm_Decl.hpp>
67#include <KokkosBatched_Trsm_Serial_Impl.hpp>
68#include <KokkosBatched_Trsm_Team_Impl.hpp>
69#include <KokkosBatched_Trsv_Decl.hpp>
70#include <KokkosBatched_Trsv_Serial_Impl.hpp>
71#include <KokkosBatched_Trsv_Team_Impl.hpp>
72#include <KokkosBatched_LU_Decl.hpp>
73#include <KokkosBatched_LU_Serial_Impl.hpp>
74#include <KokkosBatched_LU_Team_Impl.hpp>
75
76#include <KokkosBlas1_nrm1.hpp>
77#include <KokkosBlas1_nrm2.hpp>
78
79#include <memory>
80
81// TODO: expose this as a CMake variable (or only use this flag when it is necessary)
82//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
83//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
84#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
85#include "cuda_profiler_api.h"
86#endif
87
88// I am not 100% sure about MPI 3 on CUDA
89#if MPI_VERSION >= 3
90#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
91#endif
92
93// ::: Experiments :::
94// define either pinned memory or CUDA memory for MPI
95// if both macros are disabled, the Tpetra memory space is used, which is UVM space for CUDA
96// if defined, this uses pinned memory instead of a device pointer
97// by default, we enable pinned memory
98#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
99//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
100
101// if defined, all views are allocated in CUDA space instead of CUDA UVM space
102#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
103
104// if defined, btdm_scalar_type is used (if impl_scalar_type is double, btdm_scalar_type is float)
105#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
106#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
107#endif
108
109// if defined, it uses multiple execution spaces
110#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
111
112namespace Ifpack2 {
113
114 namespace BlockTriDiContainerDetails {
115
116 namespace KB = KokkosBatched;
117
121 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
122
123 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
124 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
125 MemoryTraitsType::is_random_access |
126 flag>;
127
128 template <typename ViewType>
129 using Unmanaged = Kokkos::View<typename ViewType::data_type,
130 typename ViewType::array_layout,
131 typename ViewType::device_type,
132 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
133 template <typename ViewType>
134 using Atomic = Kokkos::View<typename ViewType::data_type,
135 typename ViewType::array_layout,
136 typename ViewType::device_type,
137 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
138 template <typename ViewType>
139 using Const = Kokkos::View<typename ViewType::const_data_type,
140 typename ViewType::array_layout,
141 typename ViewType::device_type,
142 typename ViewType::memory_traits>;
143 template <typename ViewType>
144 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
145
146 template <typename ViewType>
147 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
148
154
155
156 template <typename ViewType>
157 using Scratch = Kokkos::View<typename ViewType::data_type,
158 typename ViewType::array_layout,
159 typename ViewType::execution_space::scratch_memory_space,
160 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
161
165 template<typename LayoutType> struct TpetraLittleBlock;
166 template<> struct TpetraLittleBlock<Kokkos::LayoutLeft> {
167 template<typename T> KOKKOS_INLINE_FUNCTION
168 static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; }
169 };
170 template<> struct TpetraLittleBlock<Kokkos::LayoutRight> {
171 template<typename T> KOKKOS_INLINE_FUNCTION
172 static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; }
173 };
174
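// Example: for a 3x3 block (blksize = 3), entry (i = 1, j = 2) maps to flat
// index 1 + 2*3 = 7 under LayoutLeft (column-major) and 1*3 + 2 = 5 under
// LayoutRight (row-major). Illustrative check only; not part of the original
// header and not compiled.
#if 0
    assert(TpetraLittleBlock<Kokkos::LayoutLeft >::getFlatIndex(1, 2, 3) == 7);
    assert(TpetraLittleBlock<Kokkos::LayoutRight>::getFlatIndex(1, 2, 3) == 5);
#endif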
178 template<typename T> struct BlockTridiagScalarType { typedef T type; };
179#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
180 template<> struct BlockTridiagScalarType<double> { typedef float type; };
181 //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
182#endif
183
187 template<typename T> struct is_cuda { enum : bool { value = false }; };
188#if defined(KOKKOS_ENABLE_CUDA)
189 template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
190#endif
191
195 template<typename T> struct is_hip { enum : bool { value = false }; };
196#if defined(KOKKOS_ENABLE_HIP)
197 template<> struct is_hip<Kokkos::Experimental::HIP> { enum : bool { value = true }; };
198#endif
199
 203 template<typename T>
 204 struct ExecutionSpaceFactory {
 205 static void createInstance(T &exec_instance) {
206 exec_instance = T();
207 }
208#if defined(KOKKOS_ENABLE_CUDA)
209 static void createInstance(const cudaStream_t &s, T &exec_instance) {
210 exec_instance = T();
211 }
212#endif
213 };
214
215#if defined(KOKKOS_ENABLE_CUDA)
216 template<>
217 struct ExecutionSpaceFactory<Kokkos::Cuda> {
218 static void createInstance(Kokkos::Cuda &exec_instance) {
219 exec_instance = Kokkos::Cuda();
220 }
221 static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
222 exec_instance = Kokkos::Cuda(s);
223 }
224 };
225#endif
226
227#if defined(KOKKOS_ENABLE_HIP)
228 template<>
229 struct ExecutionSpaceFactory<Kokkos::Experimental::HIP> {
230 static void createInstance(Kokkos::Experimental::HIP &exec_instance) {
231 exec_instance = Kokkos::Experimental::HIP();
232 }
233 };
234#endif
235
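// ExecutionSpaceFactory gives the code below a uniform way to construct
// execution space instances, with a CUDA overload that wraps an existing
// stream. A minimal usage sketch (illustrative only, not compiled; the
// stream created here is hypothetical):
#if 0
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    Kokkos::Cuda exec_instance;
    ExecutionSpaceFactory<Kokkos::Cuda>::createInstance(stream, exec_instance);
    // kernels launched with exec_instance now run on "stream"
#endif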
239 template<typename CommPtrType>
240 std::string get_msg_prefix (const CommPtrType &comm) {
241 const auto rank = comm->getRank();
242 const auto nranks = comm->getSize();
243 std::stringstream ss;
244 ss << "Rank " << rank << " of " << nranks << ": ";
245 return ss.str();
246 }
247
 251 template<typename T, int N>
 252 struct ArrayValueType {
 253 T v[N];
 254 KOKKOS_INLINE_FUNCTION
 255 ArrayValueType() {
 256 for (int i=0;i<N;++i)
 257 this->v[i] = 0;
 258 }
 259 KOKKOS_INLINE_FUNCTION
 260 ArrayValueType(const ArrayValueType &b) {
 261 for (int i=0;i<N;++i)
 262 this->v[i] = b.v[i];
 263 }
264 };
265 template<typename T, int N>
266 static
267 KOKKOS_INLINE_FUNCTION
268 void
269 operator+=(ArrayValueType<T,N> &a,
270 const ArrayValueType<T,N> &b) {
271 for (int i=0;i<N;++i)
272 a.v[i] += b.v[i];
273 }
274
278 template<typename T, int N, typename ExecSpace>
279 struct SumReducer {
 280 typedef SumReducer reducer;
 281 typedef ArrayValueType<T,N> value_type;
 282 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
283 value_type *value;
284
285 KOKKOS_INLINE_FUNCTION
286 SumReducer(value_type &val) : value(&val) {}
287
288 KOKKOS_INLINE_FUNCTION
289 void join(value_type &dst, value_type const &src) const {
290 for (int i=0;i<N;++i)
291 dst.v[i] += src.v[i];
292 }
293 KOKKOS_INLINE_FUNCTION
294 void init(value_type &val) const {
295 for (int i=0;i<N;++i)
296 val.v[i] = Kokkos::reduction_identity<T>::sum();
297 }
298 KOKKOS_INLINE_FUNCTION
299 value_type& reference() {
300 return *value;
301 }
302 KOKKOS_INLINE_FUNCTION
303 result_view_type view() const {
304 return result_view_type(value);
305 }
306 };
307
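// A minimal sketch of how ArrayValueType and SumReducer combine in a
// Kokkos::parallel_reduce to accumulate N components in one pass
// (illustrative only, not compiled; "exec_space_type", "x", and "n" are
// hypothetical):
#if 0
    using example_value_type = ArrayValueType<double,2>;
    example_value_type sums;
    Kokkos::parallel_reduce
      ("Example::ComponentSums",
       Kokkos::RangePolicy<exec_space_type>(0, n),
       KOKKOS_LAMBDA(const int i, example_value_type &update) {
         update.v[0] += x(2*i);
         update.v[1] += x(2*i + 1);
       }, SumReducer<double,2,exec_space_type>(sums));
#endif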
308#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
309#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
310#else
311#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
312#endif
313
314#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
315#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
316 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
317
318#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
319 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
320#else
322#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
323#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
324#endif
325
329 template <typename MatrixType>
330 struct ImplType {
334 typedef size_t size_type;
335 typedef typename MatrixType::scalar_type scalar_type;
336 typedef typename MatrixType::local_ordinal_type local_ordinal_type;
337 typedef typename MatrixType::global_ordinal_type global_ordinal_type;
338 typedef typename MatrixType::node_type node_type;
339
343 typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
344 typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
345
346 typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
347 typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
348
352 typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
353
357 typedef typename node_type::device_type node_device_type;
358 typedef typename node_device_type::execution_space node_execution_space;
359 typedef typename node_device_type::memory_space node_memory_space;
360
361#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
363 typedef node_execution_space execution_space;
364 typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
365 Kokkos::CudaSpace,
366 node_memory_space>::type memory_space;
367 typedef Kokkos::Device<execution_space,memory_space> device_type;
368#else
369 typedef node_device_type device_type;
370 typedef node_execution_space execution_space;
371 typedef node_memory_space memory_space;
372#endif
373
374 typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
375 typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
376 typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
377 typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
378 typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
379 typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
380 typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
381 typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
382
386 template<typename T, int l> using Vector = KB::Vector<T,l>;
387 template<typename T> using SIMD = KB::SIMD<T>;
388 template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
389 template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
390
391 static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
392 static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
393 typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
394 typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
395
399 typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
400 typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
401 // tpetra block crs values
402 typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
403 typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
404
405 // tpetra multivector values (layout left): may need to change the typename more explicitly
406 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
407 typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
408
 409 // packed data always uses layout right
410 typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
411 typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
412 typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
413 typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
414 typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
415 typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
416 };
417
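// The SIMD typedefs above determine how tridiag data is packed: each entry of
// a vector_type_3d_view is a pack of vector_length scalars, so values(b,i,j)[v]
// is the (i,j) entry of block b for pack member v. A minimal sketch
// (illustrative only, not compiled; "MatrixType", "num_packed_blocks", and
// "blocksize" are hypothetical):
#if 0
    using impl_type = ImplType<MatrixType>;
    typename impl_type::vector_type_3d_view
      values(do_not_initialize_tag("example values"), num_packed_blocks, blocksize, blocksize);
    // scalar access to pack member v of block b:
    //   typename impl_type::btdm_scalar_type s = values(b, i, j)[v];
#endif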
421 template<typename MatrixType>
422 typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
423 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
424 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
425 using impl_type = ImplType<MatrixType>;
426 using tpetra_map_type = typename impl_type::tpetra_map_type;
427 using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
428 using tpetra_import_type = typename impl_type::tpetra_import_type;
429
430 const auto g = A->getCrsGraph(); // tpetra crs graph object
431 const auto blocksize = A->getBlockSize();
432 const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
433 const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
434
435 return Teuchos::rcp(new tpetra_import_type(src, tgt));
436 }
437
438 // Partial replacement for forward-mode MultiVector::doImport.
 439 // Permits overlapped communication and computation, but also supports a synchronized mode.
 440 // I'm finding that overlapped comm/comp can give quite poor performance on some
 441 // platforms, so we can't just use it straightforwardly in all cases.
442
443 template<typename MatrixType>
444 struct AsyncableImport {
445 public:
446 using impl_type = ImplType<MatrixType>;
447
448 private:
452#if !defined(HAVE_IFPACK2_MPI)
453 typedef int MPI_Request;
454 typedef int MPI_Comm;
455#endif
458 using scalar_type = typename impl_type::scalar_type;
459
460 static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
461#ifdef HAVE_IFPACK2_MPI
462 MPI_Request ureq;
463 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
464 if (ireq == NULL) MPI_Request_free(&ureq);
465 return ret;
466#else
467 return 0;
468#endif
469 }
470
471 static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
472#ifdef HAVE_IFPACK2_MPI
473 MPI_Request ureq;
474 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
475 if (ireq == NULL) MPI_Request_free(&ureq);
476 return ret;
477#else
478 return 0;
479#endif
480 }
481
482 static int waitany(int count, MPI_Request* reqs, int* index) {
483#ifdef HAVE_IFPACK2_MPI
484 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
485#else
486 return 0;
487#endif
488 }
489
490 static int waitall(int count, MPI_Request* reqs) {
491#ifdef HAVE_IFPACK2_MPI
492 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
493#else
494 return 0;
495#endif
496 }
497
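// These thin wrappers let the surrounding code compile with or without MPI;
// passing ireq == NULL makes a call "fire and forget" (the temporary request
// is freed immediately). A minimal sketch, as it would appear inside a member
// function (illustrative only, not compiled; buffers, count, and peer are
// hypothetical):
#if 0
    MPI_Request req;
    irecv(comm, recv_buf, count, peer, /*tag=*/42, &req);
    isend(comm, send_buf, count, peer, /*tag=*/42, NULL); // request not tracked
    int idx = 0;
    waitany(1, &req, &idx); // or: waitall(1, &req);
#endif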
498 public:
499 using tpetra_map_type = typename impl_type::tpetra_map_type;
500 using tpetra_import_type = typename impl_type::tpetra_import_type;
501
502 using local_ordinal_type = typename impl_type::local_ordinal_type;
503 using global_ordinal_type = typename impl_type::global_ordinal_type;
504 using size_type = typename impl_type::size_type;
505 using impl_scalar_type = typename impl_type::impl_scalar_type;
506
507 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
508 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
509
510 using execution_space = typename impl_type::execution_space;
511 using memory_space = typename impl_type::memory_space;
512 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
513 using size_type_1d_view = typename impl_type::size_type_1d_view;
514 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
515
516#if defined(KOKKOS_ENABLE_CUDA)
517 using impl_scalar_type_1d_view =
518 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
519# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
520 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
521# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
522 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
523# else // no experimental macros are defined
524 typename impl_type::impl_scalar_type_1d_view,
525# endif
526 typename impl_type::impl_scalar_type_1d_view>::type;
527#else
528 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
529#endif
530 using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
531 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
532
533#ifdef HAVE_IFPACK2_MPI
534 MPI_Comm comm;
535#endif
536
537 impl_scalar_type_2d_view_tpetra remote_multivector;
538 local_ordinal_type blocksize;
539
540 template<typename T>
541 struct SendRecvPair {
542 T send, recv;
543 };
544
545 // (s)end and (r)eceive data:
546 SendRecvPair<int_1d_view_host> pids; // mpi ranks
 547 SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is a pointer type; cannot use a Kokkos view
548 SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
549 SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
550 SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
551 SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
552
553 local_ordinal_type_1d_view dm2cm; // permutation
554
555#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
556 using exec_instance_1d_std_vector = std::vector<execution_space>;
557 exec_instance_1d_std_vector exec_instances;
558#endif
559
560 // for cuda
561 public:
562 void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
563 const size_type_1d_view &offs) {
564 // wrap lens to kokkos view and deep copy to device
565 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
566 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
567
568 // exclusive scan
569 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
570 const local_ordinal_type lens_size = lens_device.extent(0);
571 Kokkos::parallel_scan
572 ("AsyncableImport::RangePolicy::setOffsetValues",
573 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
574 if (final)
575 offs(i) = update;
576 update += (i < lens_size ? lens_device[i] : 0);
577 });
578 }
579
580 void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
581 const size_type_1d_view_host &offs) {
582 // wrap lens to kokkos view and deep copy to device
583 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
584 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
585
586 // exclusive scan
587 offs(0) = 0;
588 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
589 offs(i) = offs(i-1) + lens[i-1];
590 }
591 }
592
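// Both routines above compute an exclusive prefix sum over the message
// lengths: for lens = {3, 2, 4} the offsets become {0, 3, 5, 9}, so offs(i)
// is where chunk i starts and the last entry is the total length. A worked
// check, as it would appear inside a member function (illustrative only,
// not compiled):
#if 0
    const size_t example_lens[] = {3, 2, 4};
    size_type_1d_view_host offs(do_not_initialize_tag("example offs"), 4);
    setOffsetValuesHost(Teuchos::ArrayView<const size_t>(example_lens, 3), offs);
    // offs == {0, 3, 5, 9}
#endif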
593 private:
594 void createMpiRequests(const tpetra_import_type &import) {
595 Tpetra::Distributor &distributor = import.getDistributor();
596
597 // copy pids from distributor
598 const auto pids_from = distributor.getProcsFrom();
599 pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
600 memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
601
602 const auto pids_to = distributor.getProcsTo();
603 pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
604 memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
605
606 // mpi requests
607 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
608 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
609
610 // construct offsets
611#if 0
612 const auto lengths_to = distributor.getLengthsTo();
613 offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
614
615 const auto lengths_from = distributor.getLengthsFrom();
616 offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
617
618 setOffsetValues(lengths_to, offset.send);
619 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
620
621 setOffsetValues(lengths_from, offset.recv);
622 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
623#else
624 const auto lengths_to = distributor.getLengthsTo();
625 offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
626
627 const auto lengths_from = distributor.getLengthsFrom();
628 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
629
630 setOffsetValuesHost(lengths_to, offset_host.send);
631 //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
632
633 setOffsetValuesHost(lengths_from, offset_host.recv);
634 //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
635#endif
636 }
637
638 void createSendRecvIDs(const tpetra_import_type &import) {
639 // For each remote PID, the list of LIDs to receive.
640 const auto remote_lids = import.getRemoteLIDs();
641 const local_ordinal_type_1d_view_host
642 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
643 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
644 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
645
646 // For each export PID, the list of LIDs to send.
647 auto epids = import.getExportPIDs();
648 auto elids = import.getExportLIDs();
649 TEUCHOS_ASSERT(epids.size() == elids.size());
650 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
651 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
652
653 // naive search (not sure if pids or epids are sorted)
654 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
655 const auto pid_send_value = pids.send[i];
656 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
657 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
658#if !defined(__HIP_DEVICE_COMPILE__) && !defined(__CUDA_ARCH__)
659 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
660#endif
661 }
662 Kokkos::deep_copy(lids.send, lids_send_host);
663 }
664
665 void createExecutionSpaceInstances() {
666#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
667 //The following line creates 8 streams:
668 exec_instances =
669 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
670#endif
671 }
672
673 public:
674 // for cuda, all tag types are public
675 struct ToBuffer {};
676 struct ToMultiVector {};
677
678 AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
679 const Teuchos::RCP<const tpetra_map_type>& tgt_map,
680 const local_ordinal_type blocksize_,
681 const local_ordinal_type_1d_view dm2cm_) {
682 blocksize = blocksize_;
683 dm2cm = dm2cm_;
684
685#ifdef HAVE_IFPACK2_MPI
686 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
687#endif
688 const tpetra_import_type import(src_map, tgt_map);
689
690 createMpiRequests(import);
691 createSendRecvIDs(import);
692 createExecutionSpaceInstances();
693 }
694
695 void createDataBuffer(const local_ordinal_type &num_vectors) {
696 const size_type extent_0 = lids.recv.extent(0)*blocksize;
697 const size_type extent_1 = num_vectors;
698 if (remote_multivector.extent(0) == extent_0 &&
699 remote_multivector.extent(1) == extent_1) {
700 // skip
701 } else {
702 remote_multivector =
703 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
704
705 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
706 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
707
708 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
709 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
710 }
711 }
712
713 void cancel () {
714#ifdef HAVE_IFPACK2_MPI
715 waitall(reqs.recv.size(), reqs.recv.data());
716 waitall(reqs.send.size(), reqs.send.data());
717#endif
718 }
719
720 // ======================================================================
721 // Async version using cuda stream
722 // - cuda only with kokkos develop branch
723 // ======================================================================
724
725#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
726 template<typename PackTag>
727 static
728 void copy(const local_ordinal_type_1d_view &lids_,
729 const impl_scalar_type_1d_view &buffer_,
730 const local_ordinal_type ibeg_,
731 const local_ordinal_type iend_,
732 const impl_scalar_type_2d_view_tpetra &multivector_,
733 const local_ordinal_type blocksize_,
734 const execution_space &exec_instance_) {
735 const local_ordinal_type num_vectors = multivector_.extent(1);
736 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
737 const local_ordinal_type idiff = iend_ - ibeg_;
738 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
739
740 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
741 local_ordinal_type vector_size(0);
742 if (blocksize_ <= 4) vector_size = 4;
743 else if (blocksize_ <= 8) vector_size = 8;
744 else if (blocksize_ <= 16) vector_size = 16;
745 else vector_size = 32;
746
747 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
748 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
749 Kokkos::parallel_for
750 (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
751 Kokkos::Experimental::require(policy, work_item_property),
752 KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
753 const local_ordinal_type i = member.league_rank();
754 Kokkos::parallel_for
755 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
756 auto aptr = abase + blocksize_*(i + idiff*j);
757 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
758 if (std::is_same<PackTag,ToBuffer>::value)
759 Kokkos::parallel_for
760 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
761 aptr[k] = bptr[k];
762 });
763 else
764 Kokkos::parallel_for
765 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
766 bptr[k] = aptr[k];
767 });
768 });
769 });
770 }
771
772 void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
773 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
774
775#ifdef HAVE_IFPACK2_MPI
776 // constants and reallocate data buffers if necessary
777 const local_ordinal_type num_vectors = mv.extent(1);
778 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
779
780 // 0. post receive async
781 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
782 irecv(comm,
783 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
784 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
785 pids.recv[i],
786 42,
787 &reqs.recv[i]);
788 }
789
791 execution_space().fence();
792
793 // 1. async memcpy
794 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
795 // 1.0. enqueue pack buffer
796 if (i<8) exec_instances[i%8].fence();
797 copy<ToBuffer>(lids.send, buffer.send,
798 offset_host.send(i), offset_host.send(i+1),
799 mv, blocksize,
800 //execution_space());
801 exec_instances[i%8]);
802
803 }
805 //execution_space().fence();
806 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
807 // 1.1. sync the stream and isend
808 if (i<8) exec_instances[i%8].fence();
809 isend(comm,
810 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
811 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
812 pids.send[i],
813 42,
814 &reqs.send[i]);
815 }
816
817 // 2. poke communication
818 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
819 int flag;
820 MPI_Status stat;
821 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
822 }
823#endif // HAVE_IFPACK2_MPI
824 }
825
826 void syncRecvVar1() {
827 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
828#ifdef HAVE_IFPACK2_MPI
829 // 0. wait for receive async.
830 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
831 local_ordinal_type idx = i;
832
833 // 0.0. wait any
834 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
835
836 // 0.1. unpack data after data is moved into a device
837 copy<ToMultiVector>(lids.recv, buffer.recv,
838 offset_host.recv(idx), offset_host.recv(idx+1),
839 remote_multivector, blocksize,
840 exec_instances[idx%8]);
841 }
842
843 // 1. fire up all cuda events
844 Kokkos::fence();
845
846 // 2. cleanup all open comm
847 waitall(reqs.send.size(), reqs.send.data());
848#endif // HAVE_IFPACK2_MPI
849 }
850#endif // defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
851
852 // ======================================================================
853 // Generic version without using cuda stream
 854 // - the only difference between the cuda and host architectures is the use
 855 // of team or range policies.
856 // ======================================================================
857 template<typename PackTag>
858 static
859 void copy(const local_ordinal_type_1d_view &lids_,
860 const impl_scalar_type_1d_view &buffer_,
861 const local_ordinal_type &ibeg_,
862 const local_ordinal_type &iend_,
863 const impl_scalar_type_2d_view_tpetra &multivector_,
864 const local_ordinal_type blocksize_) {
865 const local_ordinal_type num_vectors = multivector_.extent(1);
866 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
867 const local_ordinal_type idiff = iend_ - ibeg_;
868 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
869 if (is_cuda<execution_space>::value || is_hip<execution_space>::value) {
870#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
871 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
872 local_ordinal_type vector_size(0);
873 if (blocksize_ <= 4) vector_size = 4;
874 else if (blocksize_ <= 8) vector_size = 8;
875 else if (blocksize_ <= 16) vector_size = 16;
876 else vector_size = 32;
877 const team_policy_type policy(idiff, 1, vector_size);
878 Kokkos::parallel_for
879 ("AsyncableImport::TeamPolicy::copy",
880 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
881 const local_ordinal_type i = member.league_rank();
882 Kokkos::parallel_for
883 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
884 auto aptr = abase + blocksize_*(i + idiff*j);
885 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
886 if (std::is_same<PackTag,ToBuffer>::value)
887 Kokkos::parallel_for
888 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
889 aptr[k] = bptr[k];
890 });
891 else
892 Kokkos::parallel_for
893 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
894 bptr[k] = aptr[k];
895 });
896 });
897 });
898#endif
899 } else {
900#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
901 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
902#else
903 {
904 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
905 Kokkos::parallel_for
906 ("AsyncableImport::RangePolicy::copy",
907 policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
908 const local_ordinal_type i = ij%idiff;
909 const local_ordinal_type j = ij/idiff;
910 auto aptr = abase + blocksize_*(i + idiff*j);
911 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
912 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
913 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
914 memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
915 });
916 }
917#endif
918 }
919 }
920
921
925 void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
926 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
927
928#ifdef HAVE_IFPACK2_MPI
929 // constants and reallocate data buffers if necessary
930 const local_ordinal_type num_vectors = mv.extent(1);
931 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
932
933 // receive async
934 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
935 irecv(comm,
936 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
937 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
938 pids.recv[i],
939 42,
940 &reqs.recv[i]);
941 }
942
943 // send async
944 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
945 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
946 mv, blocksize);
947 Kokkos::fence();
948 isend(comm,
949 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
950 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
951 pids.send[i],
952 42,
953 &reqs.send[i]);
954 }
955
956 // I find that issuing an Iprobe seems to nudge some MPIs into action,
957 // which helps with overlapped comm/comp performance.
958 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
959 int flag;
960 MPI_Status stat;
961 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
962 }
963#endif
964 }
965
966 void syncRecvVar0() {
967 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
968#ifdef HAVE_IFPACK2_MPI
969 // receive async.
970 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
971 local_ordinal_type idx = i;
972 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
973 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
974 remote_multivector, blocksize);
975 }
976 // wait on the sends to match all Isends with a cleanup operation.
977 waitall(reqs.send.size(), reqs.send.data());
978#endif
979 }
980
984 void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
985#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
986#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
987 asyncSendRecvVar1(mv);
988#else
989 asyncSendRecvVar0(mv);
990#endif
991#else
992 asyncSendRecvVar0(mv);
993#endif
994 }
995 void syncRecv() {
996#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
997#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
998 syncRecvVar1();
999#else
1000 syncRecvVar0();
1001#endif
1002#else
1003 syncRecvVar0();
1004#endif
1005 }
1006
1007 void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
1008 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
1009 asyncSendRecv(mv);
1010 syncRecv();
1011 }
1012
1013 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
1014 };
1015
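// Typical use of AsyncableImport overlaps communication with computation on
// locally owned data. A minimal sketch (illustrative only, not compiled;
// "async_importer", "num_vectors", and "x_owned" are hypothetical):
#if 0
    async_importer->createDataBuffer(num_vectors);
    async_importer->asyncSendRecv(x_owned);  // post irecvs, pack and isend
    // ... compute with locally owned data while messages are in flight ...
    async_importer->syncRecv();              // unpack each message as it arrives
    const auto x_remote = async_importer->getRemoteMultiVectorLocalView();
#endif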
1019 template<typename MatrixType>
1020 Teuchos::RCP<AsyncableImport<MatrixType> >
1021 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
1022 using impl_type = ImplType<MatrixType>;
1023 using tpetra_map_type = typename impl_type::tpetra_map_type;
1024 using local_ordinal_type = typename impl_type::local_ordinal_type;
1025 using global_ordinal_type = typename impl_type::global_ordinal_type;
1026 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1027
1028 const auto g = A->getCrsGraph(); // tpetra crs graph object
1029 const auto blocksize = A->getBlockSize();
1030 const auto domain_map = g.getDomainMap();
1031 const auto column_map = g.getColMap();
1032
1033 std::vector<global_ordinal_type> gids;
1034 bool separate_remotes = true, found_first = false, need_owned_permutation = false;
1035 for (size_t i=0;i<column_map->getLocalNumElements();++i) {
1036 const global_ordinal_type gid = column_map->getGlobalElement(i);
1037 if (!domain_map->isNodeGlobalElement(gid)) {
1038 found_first = true;
1039 gids.push_back(gid);
1040 } else if (found_first) {
1041 separate_remotes = false;
1042 break;
1043 }
1044 if (!need_owned_permutation &&
1045 domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
1046 // The owned part of the domain and column maps are different
1047 // orderings. We *could* do a super efficient impl of this case in the
1048 // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
1049 // really, if a caller cares about speed, they wouldn't make different
1050 // local permutations like this. So we punt on the best impl and go for
1051 // a pretty good one: the permutation is done in place in
1052 // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
1053 // is the presumably worse memory access pattern of the input vector.
1054 need_owned_permutation = true;
1055 }
1056 }
1057
1058 if (separate_remotes) {
1059 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
1060 const auto parsimonious_col_map
1061 = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
1062 if (parsimonious_col_map->getGlobalNumElements() > 0) {
1063 // make the importer only if needed.
1064 local_ordinal_type_1d_view dm2cm;
1065 if (need_owned_permutation) {
1066 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
1067 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
1068 for (size_t i=0;i<domain_map->getLocalNumElements();++i)
1069 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
1070 Kokkos::deep_copy(dm2cm, dm2cm_host);
1071 }
1072 return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
1073 }
1074 }
1075 return Teuchos::null;
1076 }
1077
1078 template<typename MatrixType>
1079 struct PartInterface {
1080 using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
1081 using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
1082
1083 PartInterface() = default;
1084 PartInterface(const PartInterface &b) = default;
1085
1086 // Some terms:
1087 // The matrix A is split as A = D + R, where D is the matrix of tridiag
1088 // blocks and R is the remainder.
1089 // A part is roughly a synonym for a tridiag. The distinction is that a part
1090 // is the set of rows belonging to one tridiag and, equivalently, the off-diag
1091 // rows in R associated with that tridiag. In contrast, the term tridiag is
1092 // used to refer specifically to tridiag data, such as the pointer into the
1093 // tridiag data array.
 1094 // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
1095 // tridiag, and partptr points to the beginning of each tridiag. This is the
1096 // LID space.
1097 // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
1098 // by this ordinal. This is the 'index' space.
1099 // A flat index is the mathematical index into an array. A pack index
1100 // accounts for SIMD packing.
1101
1102 // Local row LIDs. Permutation from caller's index space to tridiag index
1103 // space.
1104 local_ordinal_type_1d_view lclrow;
1105 // partptr_ is the pointer array into lclrow_.
1106 local_ordinal_type_1d_view partptr; // np+1
1107 // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
1108 // is the start of the i'th pack.
1109 local_ordinal_type_1d_view packptr; // npack+1
1110 // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
1111 // an alias of partptr_ in the case of no overlap.
1112 local_ordinal_type_1d_view part2rowidx0; // np+1
1113 // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
1114 // it's the same as part2rowidx0_; if it's > 1, then the value is combined
1115 // with i % vector_length to get the location in the packed data.
1116 local_ordinal_type_1d_view part2packrowidx0; // np+1
1117 local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
1118 // rowidx2part_ maps the row index to the part index.
1119 local_ordinal_type_1d_view rowidx2part; // nr
1120 // True if lcl{row|col} is at most a constant away from row{idx|col}. In
1121 // practice, this knowledge is not particularly useful, as packing for batched
1122 // processing is done at the same time as the permutation from LID to index
1123 // space. But it's easy to detect, so it's recorded in case an optimization
1124 // can be made based on it.
1125 bool row_contiguous;
1126
1127 local_ordinal_type max_partsz;
1128 };
1129
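// Worked example (illustrative, assuming vector_length == 2 for the sake of
// the numbers): partitions = { {4,5,6}, {0,1}, {2,3} }. Parts are reordered
// by nonincreasing size, so
//   partptr          = {0, 3, 5, 7}      lclrow      = {4,5,6, 0,1, 2,3}
//   part2rowidx0     = {0, 3, 5, 7}      rowidx2part = {0,0,0, 1,1, 2,2}
//   part2packrowidx0 = {0, 0, 3, 5}      packptr     = {0, 2, 3}
// i.e. parts 0 and 1 share the first SIMD pack (3 packed rows, with the
// shorter part padded), and part 2 forms the second pack (2 packed rows).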
1133 template<typename MatrixType>
1134 PartInterface<MatrixType>
1135 createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1136 const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
1137 using impl_type = ImplType<MatrixType>;
1138 using local_ordinal_type = typename impl_type::local_ordinal_type;
1139 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1140
1141 constexpr int vector_length = impl_type::vector_length;
1142
1143 const auto comm = A->getRowMap()->getComm();
1144
1145 PartInterface<MatrixType> interf;
1146
1147 const bool jacobi = partitions.size() == 0;
1148 const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
1149 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1150
1151#if defined(BLOCKTRIDICONTAINER_DEBUG)
1152 local_ordinal_type nrows = 0;
1153 if (jacobi)
1154 nrows = nparts;
1155 else
1156 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1157
1158 TEUCHOS_TEST_FOR_EXCEPT_MSG
1159 (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1160 << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1161#endif
1162
1163 // permutation vector
1164 std::vector<local_ordinal_type> p;
1165 if (jacobi) {
1166 interf.max_partsz = 1;
1167 } else {
1168 // reorder parts to maximize simd packing efficiency
1169 p.resize(nparts);
1170
1171 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1172 std::vector<size_idx_pair_type> partsz(nparts);
1173 for (local_ordinal_type i=0;i<nparts;++i)
1174 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1175 std::sort(partsz.begin(), partsz.end(),
1176 [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1177 return x.first > y.first;
1178 });
1179 for (local_ordinal_type i=0;i<nparts;++i)
1180 p[i] = partsz[i].second;
1181
1182 interf.max_partsz = partsz[0].first;
1183 }
1184
1185 // allocate parts
1186 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1187 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1188 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1189 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1190 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1191
1192 // mirror to host and compute on host execution space
1193 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1194 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1195 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1196 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1197 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1198
1199 // Determine parts.
1200 interf.row_contiguous = true;
1201 partptr(0) = 0;
1202 part2rowidx0(0) = 0;
1203 part2packrowidx0(0) = 0;
1204 local_ordinal_type pack_nrows = 0;
1205 if (jacobi) {
1206 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1207 const local_ordinal_type ipnrows = 1;
1208 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1209 get_msg_prefix(comm)
1210 << "partition " << p[ip]
1211 << " is empty, which is not allowed.");
 1212 // assume no overlap.
1213 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1214 // Since parts are ordered in nonincreasing size, the size of the first
1215 // part in a pack is the size for all parts in the pack.
1216 if (ip % vector_length == 0) pack_nrows = ipnrows;
1217 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1218 const local_ordinal_type os = partptr(ip);
1219 for (local_ordinal_type i=0;i<ipnrows;++i) {
1220 const auto lcl_row = ip;
1221 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1222 get_msg_prefix(comm)
1223 << "partitions[" << p[ip] << "]["
1224 << i << "] = " << lcl_row
1225 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1226 << "].");
1227 lclrow(os+i) = lcl_row;
1228 rowidx2part(os+i) = ip;
1229 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1230 interf.row_contiguous = false;
1231 }
1232 partptr(ip+1) = os + ipnrows;
1233 }
1234 } else {
1235 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1236 const auto* part = &partitions[p[ip]];
1237 const local_ordinal_type ipnrows = part->size();
1238 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1239 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1240 get_msg_prefix(comm)
1241 << "partition " << p[ip]
1242 << " is empty, which is not allowed.");
 1243 // assume no overlap.
1244 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1245 // Since parts are ordered in nonincreasing size, the size of the first
1246 // part in a pack is the size for all parts in the pack.
1247 if (ip % vector_length == 0) pack_nrows = ipnrows;
1248 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1249 const local_ordinal_type os = partptr(ip);
1250 for (local_ordinal_type i=0;i<ipnrows;++i) {
1251 const auto lcl_row = (*part)[i];
1252 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1253 get_msg_prefix(comm)
1254 << "partitions[" << p[ip] << "]["
1255 << i << "] = " << lcl_row
1256 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1257 << "].");
1258 lclrow(os+i) = lcl_row;
1259 rowidx2part(os+i) = ip;
1260 if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
1261 interf.row_contiguous = false;
1262 }
1263 partptr(ip+1) = os + ipnrows;
1264 }
1265 }
1266#if defined(BLOCKTRIDICONTAINER_DEBUG)
1267 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1268#endif
1269 if (lclrow(0) != 0) interf.row_contiguous = false;
1270
1271 Kokkos::deep_copy(interf.partptr, partptr);
1272 Kokkos::deep_copy(interf.lclrow, lclrow);
1273
 1274 // assume no overlap. Thus:
1275 interf.part2rowidx0 = interf.partptr;
1276 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1277
1278 interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
1279 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1280
1281 { // Fill packptr.
1282 local_ordinal_type npacks = 0;
1283 for (local_ordinal_type ip=1;ip<=nparts;++ip)
1284 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1285 ++npacks;
1286 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1287 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1288 packptr(0) = 0;
1289 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1290 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1291 packptr(k++) = ip;
1292 Kokkos::deep_copy(interf.packptr, packptr);
1293 }
1294
1295 return interf;
1296 }
1297
 1301 template <typename MatrixType>
 1302 struct BlockTridiags {
 1303 using impl_type = ImplType<MatrixType>;
 1304 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1305 using size_type_1d_view = typename impl_type::size_type_1d_view;
1306 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1307
1308 // flat_td_ptr(i) is the index into flat-array values of the start of the
1309 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1310 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1311 // vector_length is the position in the pack.
1312 size_type_1d_view flat_td_ptr, pack_td_ptr;
1313 // List of local column indices into A from which to grab
1314 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1315 local_ordinal_type_1d_view A_colindsub;
1316 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1317 // tridiag's pack, and i % vector_length gives the position in the pack.
1318 vector_type_3d_view values;
1319
1320 bool is_diagonal_only;
1321
1322 BlockTridiags() = default;
1323 BlockTridiags(const BlockTridiags &b) = default;
1324
1325 // Index into row-major block of a tridiag.
1326 template <typename idx_type>
1327 static KOKKOS_FORCEINLINE_FUNCTION
1328 idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1329 // Given a row of a row-major tridiag, return the index of the first block
1330 // in that row.
1331 template <typename idx_type>
1332 static KOKKOS_FORCEINLINE_FUNCTION
1333 idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1334 // Number of blocks in a tridiag having a given number of rows.
1335 template <typename idx_type>
1336 static KOKKOS_FORCEINLINE_FUNCTION
1337 idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1338 };
1339
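// Worked example (illustrative): a tridiag with nrows = 4 stores its blocks
// row-major as D0 U0 | L1 D1 U1 | L2 D2 U2 | L3 D3, so NumBlocks(4) = 10,
// RowToIndex = {0, 2, 5, 8}, and IndexToRow(4) = 1 (block 4 is U1, which
// lives in row 1). The checks below are not compiled; "MatrixType" is hypothetical.
#if 0
    assert(BlockTridiags<MatrixType>::NumBlocks(4) == 10);
    assert(BlockTridiags<MatrixType>::RowToIndex(2) == 5);
    assert(BlockTridiags<MatrixType>::IndexToRow(4) == 1);
#endif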
1340
 1344 template<typename MatrixType>
 1345 BlockTridiags<MatrixType>
 1346 createBlockTridiags(const PartInterface<MatrixType> &interf) {
1347 using impl_type = ImplType<MatrixType>;
1348 using execution_space = typename impl_type::execution_space;
1349 using local_ordinal_type = typename impl_type::local_ordinal_type;
1350 using size_type = typename impl_type::size_type;
1351 using size_type_1d_view = typename impl_type::size_type_1d_view;
1352
1353 constexpr int vector_length = impl_type::vector_length;
1354
 1354
 1355 BlockTridiags<MatrixType> btdm;
 1356
1357 const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
1358
1359 { // construct the flat index pointers into the tridiag values array.
1360 btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
1361 const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
1362 Kokkos::parallel_scan
1363 ("createBlockTridiags::RangePolicy::flat_td_ptr",
1364 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1365 if (final)
1366 btdm.flat_td_ptr(i) = update;
1367 if (i < ntridiags) {
1368 const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
1369 update += btdm.NumBlocks(nrows);
1370 }
1371 });
1372
1373 const auto nblocks = Kokkos::create_mirror_view_and_copy
1374 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
1375 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1376 }
1377
1378 // And the packed index pointers.
1379 if (vector_length == 1) {
1380 btdm.pack_td_ptr = btdm.flat_td_ptr;
1381 } else {
1382 const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
1383 btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
1384 const Kokkos::RangePolicy<execution_space> policy(0,npacks);
1385 Kokkos::parallel_scan
1386 ("createBlockTridiags::RangePolicy::pack_td_ptr",
1387 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1388 const local_ordinal_type parti = interf.packptr(i);
1389 const local_ordinal_type parti_next = interf.packptr(i+1);
1390 if (final) {
1391 const size_type nblks = update;
1392 for (local_ordinal_type pti=parti;pti<parti_next;++pti)
1393 btdm.pack_td_ptr(pti) = nblks;
1394 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1395 // last one
1396 if (i == npacks-1)
1397 btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
1398 }
1399 {
1400 const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
1401 update += btdm.NumBlocks(nrows);
1402 }
1403 });
1404 }
1405
1406 // values and A_colindsub are created in the symbolic phase
1407
1408 return btdm;
1409 }
1410
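// Worked example (illustrative), continuing the small PartInterface case
// sketched earlier (partptr = {0,3,5,7}, packptr = {0,2,3}, vector_length == 2
// assumed):
//   flat_td_ptr = {0, 7, 11, 15}   // NumBlocks(3) = 7 and NumBlocks(2) = 4 per tridiag
//   pack_td_ptr = {0, 0, 7, 11}    // parts 0 and 1 share pack 0, so both start at 0
// Each pack stores only the blocks of its longest part; the shorter parts in
// the pack occupy the other SIMD lanes of those same blocks.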
1411 // Set the tridiags to be I to the full pack block size. That way, if a
1412 // tridiag within a pack is shorter than the longest one, the extra blocks are
1413 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
 1414 // in the packed multivector are 0, and the tridiag LU reflects the extra I
1415 // blocks, then the solve proceeds as though the extra blocks aren't
1416 // present. Since this extra work is part of the SIMD calls, it's not actually
1417 // extra work. Instead, it means we don't have to put checks or masks in, or
1418 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1419 // since the numeric phase fills in only the used entries, leaving these I
1420 // blocks intact.
1421 template<typename MatrixType>
1422 void
1423 setTridiagsToIdentity
1424 (const BlockTridiags<MatrixType>& btdm,
1425 const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1426 {
1427 using impl_type = ImplType<MatrixType>;
1428 using execution_space = typename impl_type::execution_space;
1429 using local_ordinal_type = typename impl_type::local_ordinal_type;
1430 using size_type_1d_view = typename impl_type::size_type_1d_view;
1431
1432 const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
1433 const local_ordinal_type blocksize = btdm.values.extent(1);
1434
1435 {
1436 const int vector_length = impl_type::vector_length;
1437 const int internal_vector_length = impl_type::internal_vector_length;
1438
1439 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1440 using internal_vector_type = typename impl_type::internal_vector_type;
1441 using internal_vector_type_4d_view =
1442 typename impl_type::internal_vector_type_4d_view;
1443
1444 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1445 const internal_vector_type_4d_view values
1446 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1447 btdm.values.extent(0),
1448 btdm.values.extent(1),
1449 btdm.values.extent(2),
1450 vector_length/internal_vector_length);
1451 const local_ordinal_type vector_loop_size = values.extent(3);
1452#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1453 local_ordinal_type total_team_size(0);
1454 if (blocksize <= 5) total_team_size = 32;
1455 else if (blocksize <= 9) total_team_size = 64;
1456 else if (blocksize <= 12) total_team_size = 96;
1457 else if (blocksize <= 16) total_team_size = 128;
1458 else if (blocksize <= 20) total_team_size = 160;
1459 else total_team_size = 160;
1460 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1461 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1462#elif defined(KOKKOS_ENABLE_HIP)
1463 // FIXME: HIP
1464 // These settings might be completely wrong;
1465 // we will have to do some experiments to decide
1466 // what makes sense on AMD GPUs.
1467 local_ordinal_type total_team_size(0);
1468 if (blocksize <= 5) total_team_size = 32;
1469 else if (blocksize <= 9) total_team_size = 64;
1470 else if (blocksize <= 12) total_team_size = 96;
1471 else if (blocksize <= 16) total_team_size = 128;
1472 else if (blocksize <= 20) total_team_size = 160;
1473 else total_team_size = 160;
1474 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1475 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1476#else // Host architecture: team size is always one
1477 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1478#endif
1479 Kokkos::parallel_for
1480 ("setTridiagsToIdentity::TeamPolicy",
1481 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1482 const local_ordinal_type k = member.league_rank();
1483 const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
1484 const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
1485 const local_ordinal_type diff = iend - ibeg;
1486 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
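// the pack spans diff = 3*nrows - 2 block slots (nrows being the rows of its
// longest part), with a diagonal block every third slot starting at ibeg, so
// icount = nrows diagonal blocks get their diagonals set to one below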
1487 const btdm_scalar_type one(1);
1488 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1489 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1490 const local_ordinal_type i = ibeg + ii*3;
1491 for (local_ordinal_type j=0;j<blocksize;++j)
1492 values(i,j,j,v) = one;
1493 });
1494 });
1495 });
1496 }
1497 }
1498
1502 template <typename MatrixType>
1503 struct AmD {
1504 using impl_type = ImplType<MatrixType>;
1505 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1506 using size_type_1d_view = typename impl_type::size_type_1d_view;
1507 using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
1508 // rowptr points to the start of each row of A_colindsub.
1509 size_type_1d_view rowptr, rowptr_remote;
1510 // Within-row offsets into A's rows giving the blocks to extract. rowptr(i)
1511 // points to the i'th row; for each k in rowptr(row) : rowptr(row+1),
1512 // g.entries(g.row_map(row) + A_colindsub(k)), where g is A's graph, is a
1513 // column AmD uses. If seq_method_, then A_colindsub contains all the LIDs
1514 // and A_colindsub_remote is empty. If !seq_method_, then A_colindsub contains
1515 // owned LIDs and A_colindsub_remote contains the remote ones.
1516 local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
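// A minimal traversal sketch (hypothetical loop variables lr and k; g is
// A's local graph), matching how the numeric kernels index these arrays:
//
//   for (size_type k = rowptr(lr); k < rowptr(lr+1); ++k) {
//     const auto entry = g.row_map(lr) + A_colindsub(k); // position in g.entries
//     const auto col   = g.entries(entry);               // block column of R = A - D
//   }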
1517
1518 // Currently always true.
1519 bool is_tpetra_block_crs;
1520
1521 // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
1522 impl_scalar_type_1d_view_tpetra tpetra_values;
1523
1524 AmD() = default;
1525 AmD(const AmD &b) = default;
1526 };
1527
1531 template<typename MatrixType>
1532 void
1533 performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1534 const PartInterface<MatrixType> &interf,
1535 BlockTridiags<MatrixType> &btdm,
1536 AmD<MatrixType> &amd,
1537 const bool overlap_communication_and_computation) {
1538 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
1539
1540 using impl_type = ImplType<MatrixType>;
1541 // using node_memory_space = typename impl_type::node_memory_space;
1542 using host_execution_space = typename impl_type::host_execution_space;
1543
1544 using local_ordinal_type = typename impl_type::local_ordinal_type;
1545 using global_ordinal_type = typename impl_type::global_ordinal_type;
1546 using size_type = typename impl_type::size_type;
1547 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1548 using size_type_1d_view = typename impl_type::size_type_1d_view;
1549 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1550 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1551
1552 constexpr int vector_length = impl_type::vector_length;
1553
1554 const auto comm = A->getRowMap()->getComm();
1555 const auto& g = A->getCrsGraph();
1556 const auto blocksize = A->getBlockSize();
1557
1558 // mirroring to host
1559 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1560 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1561 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1562 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1563 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1564
1565 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1566
1567 // find column to row map on host
1568 Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1569 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1570 {
1571 const auto rowmap = g.getRowMap();
1572 const auto colmap = g.getColMap();
1573 const auto dommap = g.getDomainMap();
1574 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1575
1576#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
1577 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1578 Kokkos::parallel_for
1579 ("performSymbolicPhase::RangePolicy::col2row",
1580 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1581 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1582 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1583 if (dommap->isNodeGlobalElement(gid)) {
1584 const local_ordinal_type lc = colmap->getLocalElement(gid);
1585# if defined(BLOCKTRIDICONTAINER_DEBUG)
1586 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1587 get_msg_prefix(comm) << "GID " << gid
1588 << " gives an invalid local column.");
1589# endif
1590 col2row(lc) = lr;
1591 }
1592 });
1593#endif
1594 }
1595
1596 // construct the D and R graphs in A = D + R.
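// D collects the block entries whose row and column belong to the same part
// and whose part-local indices differ by at most one (the tridiag); R collects
// every other block entry, split into owned and remote columns when
// communication and computation are overlapped.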
1597 {
1598 const auto local_graph = g.getLocalGraphHost();
1599 const auto local_graph_rowptr = local_graph.row_map;
1600 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1601 const auto local_graph_colidx = local_graph.entries;
1602
1603 //assume no overlap.
1604
1605 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1606 {
1607 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1608 Kokkos::parallel_for
1609 ("performSymbolicPhase::RangePolicy::lclrow2idx",
1610 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1611 lclrow2idx[lclrow(i)] = i;
1612 });
1613 }
1614
1615 // count (block) nnzs in D and R.
1616 typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
1617 typename sum_reducer_type::value_type sum_reducer_value;
1618 {
1619 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1620 Kokkos::parallel_reduce
1621 // the profiling interface does not work here, so the kernel label below is commented out
1622 (//"performSymbolicPhase::RangePolicy::count_nnz",
1623 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1624 // LID -> index.
1625 const local_ordinal_type ri0 = lclrow2idx[lr];
1626 const local_ordinal_type pi0 = rowidx2part(ri0);
1627 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1628 const local_ordinal_type lc = local_graph_colidx(j);
1629 const local_ordinal_type lc2r = col2row[lc];
1630 bool incr_R = false;
1631 do { // breakable
1632 if (lc2r == (local_ordinal_type) -1) {
1633 incr_R = true;
1634 break;
1635 }
1636 const local_ordinal_type ri = lclrow2idx[lc2r];
1637 const local_ordinal_type pi = rowidx2part(ri);
1638 if (pi != pi0) {
1639 incr_R = true;
1640 break;
1641 }
1642 // Test for being in the tridiag. This is done in index space. In
1643 // LID space, tridiag LIDs in a row are not necessarily related by
1644 // {-1, 0, 1}.
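// i.e., the membership test is |ri - ri0| <= 1 on part-local row indices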
1645 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1646 ++update.v[0]; // D_nnz
1647 else
1648 incr_R = true;
1649 } while (0);
1650 if (incr_R) {
1651 if (lc < nrows) ++update.v[1]; // R_nnz_owned
1652 else ++update.v[2]; // R_nnz_remote
1653 }
1654 }
1655 }, sum_reducer_type(sum_reducer_value));
1656 }
1657 size_type D_nnz = sum_reducer_value.v[0];
1658 size_type R_nnz_owned = sum_reducer_value.v[1];
1659 size_type R_nnz_remote = sum_reducer_value.v[2];
1660
1661 if (!overlap_communication_and_computation) {
1662 R_nnz_owned += R_nnz_remote;
1663 R_nnz_remote = 0;
1664 }
1665
1666 // construct the D graph.
1667 {
1668 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1669
1670 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1671 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1672
1673#if defined(BLOCKTRIDICONTAINER_DEBUG)
1674 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1675#endif
1676
1677 const local_ordinal_type nparts = partptr.extent(0) - 1;
1678 {
1679 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1680 Kokkos::parallel_for
1681 ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1682 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1683 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1684 local_ordinal_type offset = 0;
1685 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1686 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1687 offset = 1;
1688 const local_ordinal_type lr0 = lclrow(ri0);
1689 const size_type j0 = local_graph_rowptr(lr0);
1690 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1691 const local_ordinal_type lc = local_graph_colidx(j);
1692 const local_ordinal_type lc2r = col2row[lc];
1693 if (lc2r == (local_ordinal_type) -1) continue;
1694 const local_ordinal_type ri = lclrow2idx[lc2r];
1695 const local_ordinal_type pi = rowidx2part(ri);
1696 if (pi != pi0) continue;
1697 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1698 const local_ordinal_type row_entry = j - j0;
1699 D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1700 }
1701 }
1702 });
1703 }
1704#if defined(BLOCKTRIDICONTAINER_DEBUG)
1705 for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1706 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1707#endif
1708 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1709
1710 // Allocate values.
1711 {
1712 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1713 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1714 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1715 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1716 }
1717 }
1718
1719 // Construct the R graph.
1720 {
1721 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1722 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
1723
1724 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1725 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1726
1727 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1728 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
1729
1730 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1731 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1732
1733 {
1734 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1735 Kokkos::parallel_for
1736 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1737 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1738 const local_ordinal_type ri0 = lclrow2idx[lr];
1739 const local_ordinal_type pi0 = rowidx2part(ri0);
1740 const size_type j0 = local_graph_rowptr(lr);
1741 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1742 const local_ordinal_type lc = local_graph_colidx(j);
1743 const local_ordinal_type lc2r = col2row[lc];
1744 if (lc2r != (local_ordinal_type) -1) {
1745 const local_ordinal_type ri = lclrow2idx[lc2r];
1746 const local_ordinal_type pi = rowidx2part(ri);
1747 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1748 continue;
1749 }
1750 }
1751 // exclusive scan will be performed later
1752 if (!overlap_communication_and_computation || lc < nrows) {
1753 ++R_rowptr(lr);
1754 } else {
1755 ++R_rowptr_remote(lr);
1756 }
1757 }
1758 });
1759 }
1760
1761 // exclusive scan
1762 typedef ArrayValueType<size_type,2> update_type;
1763 {
1764 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1765 Kokkos::parallel_scan
1766 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1767 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1768 update_type &update,
1769 const bool &final) {
1770 update_type val;
1771 val.v[0] = R_rowptr(lr);
1772 if (overlap_communication_and_computation)
1773 val.v[1] = R_rowptr_remote(lr);
1774
1775 if (final) {
1776 R_rowptr(lr) = update.v[0];
1777 if (overlap_communication_and_computation)
1778 R_rowptr_remote(lr) = update.v[1];
1779
1780 if (lr < nrows) {
1781 const local_ordinal_type ri0 = lclrow2idx[lr];
1782 const local_ordinal_type pi0 = rowidx2part(ri0);
1783
1784 size_type cnt_rowptr = R_rowptr(lr);
1785 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
1786
1787 const size_type j0 = local_graph_rowptr(lr);
1788 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1789 const local_ordinal_type lc = local_graph_colidx(j);
1790 const local_ordinal_type lc2r = col2row[lc];
1791 if (lc2r != (local_ordinal_type) -1) {
1792 const local_ordinal_type ri = lclrow2idx[lc2r];
1793 const local_ordinal_type pi = rowidx2part(ri);
1794 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1795 continue;
1796 }
1797 const local_ordinal_type row_entry = j - j0;
1798 if (!overlap_communication_and_computation || lc < nrows)
1799 R_A_colindsub(cnt_rowptr++) = row_entry;
1800 else
1801 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1802 }
1803 }
1804 }
1805 update += val;
1806 });
1807 }
1808 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1809 Kokkos::deep_copy(amd.rowptr, R_rowptr);
1810 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1811 if (overlap_communication_and_computation) {
1812 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1813 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1814 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1815 }
1816
1817 // Allocate or view values.
1818 amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
1819
1820 }
1821 }
1822 }
1823
1824
1828 template<typename ArgActiveExecutionMemorySpace>
1829 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
1830 
1831 template<>
1832 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1833 typedef KB::Mode::Serial mode_type;
1834#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1835 typedef KB::Algo::Level3::CompactMKL algo_type;
1836#else
1837 typedef KB::Algo::Level3::Blocked algo_type;
1838#endif
1839 static int recommended_team_size(const int /* blksize */,
1840 const int /* vector_length */,
1841 const int /* internal_vector_length */) {
1842 return 1;
1843 }
1844
1845 };
1846
1847#if defined(KOKKOS_ENABLE_CUDA)
1848 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1849 const int vector_length,
1850 const int internal_vector_length) {
1851 const int vector_size = vector_length/internal_vector_length;
1852 int total_team_size(0);
1853 if (blksize <= 5) total_team_size = 32;
1854 else if (blksize <= 9) total_team_size = 32; // 64
1855 else if (blksize <= 12) total_team_size = 96;
1856 else if (blksize <= 16) total_team_size = 128;
1857 else if (blksize <= 20) total_team_size = 160;
1858 else total_team_size = 160;
1859 return 2*total_team_size/vector_size;
1860 }
1861 template<>
1862 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1863 typedef KB::Mode::Team mode_type;
1864 typedef KB::Algo::Level3::Unblocked algo_type;
1865 static int recommended_team_size(const int blksize,
1866 const int vector_length,
1867 const int internal_vector_length) {
1868 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1869 }
1870 };
1871 template<>
1872 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1873 typedef KB::Mode::Team mode_type;
1874 typedef KB::Algo::Level3::Unblocked algo_type;
1875 static int recommended_team_size(const int blksize,
1876 const int vector_length,
1877 const int internal_vector_length) {
1878 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1879 }
1880 };
1881#endif
1882
1883#if defined(KOKKOS_ENABLE_HIP)
1884 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
1885 const int vector_length,
1886 const int internal_vector_length) {
1887 const int vector_size = vector_length/internal_vector_length;
1888 int total_team_size(0);
1889 if (blksize <= 5) total_team_size = 32;
1890 else if (blksize <= 9) total_team_size = 32; // 64
1891 else if (blksize <= 12) total_team_size = 96;
1892 else if (blksize <= 16) total_team_size = 128;
1893 else if (blksize <= 20) total_team_size = 160;
1894 else total_team_size = 160;
1895 return 2*total_team_size/vector_size;
1896 }
1897 template<>
1898 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
1899 typedef KB::Mode::Team mode_type;
1900 typedef KB::Algo::Level3::Unblocked algo_type;
1901 static int recommended_team_size(const int blksize,
1902 const int vector_length,
1903 const int internal_vector_length) {
1904 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1905 }
1906 };
1907 template<>
1908 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
1909 typedef KB::Mode::Team mode_type;
1910 typedef KB::Algo::Level3::Unblocked algo_type;
1911 static int recommended_team_size(const int blksize,
1912 const int vector_length,
1913 const int internal_vector_length) {
1914 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
1915 }
1916 };
1917#endif
1918
1919 template<typename MatrixType>
1920 struct ExtractAndFactorizeTridiags {
1921 public:
1922 using impl_type = ImplType<MatrixType>;
1923 // a functor cannot have both device_type and execution_space; specialization error in Kokkos
1924 using execution_space = typename impl_type::execution_space;
1925 using memory_space = typename impl_type::memory_space;
1927 using local_ordinal_type = typename impl_type::local_ordinal_type;
1928 using size_type = typename impl_type::size_type;
1929 using impl_scalar_type = typename impl_type::impl_scalar_type;
1930 using magnitude_type = typename impl_type::magnitude_type;
1932 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1934 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1935 using size_type_1d_view = typename impl_type::size_type_1d_view;
1936 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
1938 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1939 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
1940 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1941 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1942 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
1943 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1944 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
1945
1946 using internal_vector_type = typename impl_type::internal_vector_type;
1947 static constexpr int vector_length = impl_type::vector_length;
1948 static constexpr int internal_vector_length = impl_type::internal_vector_length;
1949
1951 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1952 using member_type = typename team_policy_type::member_type;
1953
1954 private:
1955 // part interface
1956 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1957 const local_ordinal_type max_partsz;
1958 // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1959 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
1960 const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
1961 const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
1962 // block tridiags
1963 const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1964 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1965 const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1966 const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
1967 // shared information
1968 const local_ordinal_type blocksize, blocksize_square;
1969 // diagonal safety
1970 const magnitude_type tiny;
1971 const local_ordinal_type vector_loop_size;
1972 const local_ordinal_type vector_length_value;
1973
1974 public:
1975 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
1976 const PartInterface<MatrixType> &interf_,
1977 const Teuchos::RCP<const block_crs_matrix_type> &A_,
1978 const magnitude_type& tiny_) :
1979 // interface
1980 partptr(interf_.partptr),
1981 lclrow(interf_.lclrow),
1982 packptr(interf_.packptr),
1983 max_partsz(interf_.max_partsz),
1984 // block crs matrix
1985 A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
1986 A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
1987 // block tridiags
1988 pack_td_ptr(btdm_.pack_td_ptr),
1989 flat_td_ptr(btdm_.flat_td_ptr),
1990 A_colindsub(btdm_.A_colindsub),
1991 internal_vector_values((internal_vector_type*)btdm_.values.data(),
1992 btdm_.values.extent(0),
1993 btdm_.values.extent(1),
1994 btdm_.values.extent(2),
1995 vector_length/internal_vector_length),
1996 scalar_values((btdm_scalar_type*)btdm_.values.data(),
1997 btdm_.values.extent(0),
1998 btdm_.values.extent(1),
1999 btdm_.values.extent(2),
2000 vector_length),
2001 blocksize(btdm_.values.extent(1)),
2002 blocksize_square(blocksize*blocksize),
2003 // diagonal weight to avoid zero pivots
2004 tiny(tiny_),
2005 vector_loop_size(vector_length/internal_vector_length),
2006 vector_length_value(vector_length) {}
2007
2008 private:
2009
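// extract() gathers the (up to vector_length) tridiags of one pack from the
// block CRS values into the packed storage; this first overload is the
// serial path (vector_loop_size == 1), and the team-parallel overload follows.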
2010 KOKKOS_INLINE_FUNCTION
2011 void
2012 extract(local_ordinal_type partidx,
2013 local_ordinal_type npacks) const {
2014 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2015 const size_type kps = pack_td_ptr(partidx);
2016 local_ordinal_type kfs[vector_length] = {};
2017 local_ordinal_type ri0[vector_length] = {};
2018 local_ordinal_type nrows[vector_length] = {};
2019
2020 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2021 kfs[vi] = flat_td_ptr(partidx);
2022 ri0[vi] = partptr(partidx);
2023 nrows[vi] = partptr(partidx+1) - ri0[vi];
2024 }
2025 for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
2026 for (local_ordinal_type e=0;e<3;++e) {
2027 const impl_scalar_type* block[vector_length] = {};
2028 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2029 const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2030 block[vi] = &A_values(Aj*blocksize_square);
2031 }
2032 const size_type pi = kps + j;
2033 ++j;
2034 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2035 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2036 //const auto idx = ii*blocksize + jj;
2037 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2038 auto& v = internal_vector_values(pi, ii, jj, 0);
2039 for (local_ordinal_type vi=0;vi<npacks;++vi)
2040 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2041 }
2042 }
2043
2044 if (nrows[0] == 1) break;
2045 if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2046 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2047 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2048 npacks = vi;
2049 break;
2050 }
2051 }
2052 }
2053 }
2054 }
2055
2056 KOKKOS_INLINE_FUNCTION
2057 void
2058 extract(const member_type &member,
2059 const local_ordinal_type &partidxbeg,
2060 const local_ordinal_type &npacks,
2061 const local_ordinal_type &vbeg) const {
2062 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2063 local_ordinal_type kfs_vals[internal_vector_length] = {};
2064 local_ordinal_type ri0_vals[internal_vector_length] = {};
2065 local_ordinal_type nrows_vals[internal_vector_length] = {};
2066
2067 const size_type kps = pack_td_ptr(partidxbeg);
2068 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2069 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
2070 ri0_vals[vi] = partptr(partidxbeg+vi);
2071 nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
2072 }
2073
2074 local_ordinal_type j_vals[internal_vector_length] = {};
2075 for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
2076 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2077 const local_ordinal_type nrows = nrows_vals[vi];
2078 if (tr < nrows) {
2079 auto &j = j_vals[vi];
2080 const local_ordinal_type kfs = kfs_vals[vi];
2081 const local_ordinal_type ri0 = ri0_vals[vi];
2082 const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
2083 const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
2084 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2085 const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2086 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2087 const size_type pi = kps + j;
2088 Kokkos::parallel_for
2089 (Kokkos::TeamThreadRange(member,blocksize),
2090 [&](const local_ordinal_type &ii) {
2091 for (local_ordinal_type jj=0;jj<blocksize;++jj)
2092 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
2093 });
2094 }
2095 }
2096 }
2097 }
2098 }
2099
2100 template<typename AAViewType,
2101 typename WWViewType>
2102 KOKKOS_INLINE_FUNCTION
2103 void
2104 factorize(const member_type &member,
2105 const local_ordinal_type &i0,
2106 const local_ordinal_type &nrows,
2107 const local_ordinal_type &v,
2108 const AAViewType &AA,
2109 const WWViewType &WW) const {
2110
2111 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
2112 <typename execution_space::memory_space> default_mode_and_algo_type;
2113
2114 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2115 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
2116
2117 // constant
2118 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2119
2120 // subview pattern
2121 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2122 KB::LU<member_type,
2123 default_mode_type,KB::Algo::LU::Unblocked>
2124 ::invoke(member, A , tiny);
2125
2126 if (nrows > 1) {
2127 auto B = A;
2128 auto C = A;
2129 local_ordinal_type i = i0;
2130 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2131 B.assign_data( &AA(i+1,0,0,v) );
2132 KB::Trsm<member_type,
2133 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2134 default_mode_type,default_algo_type>
2135 ::invoke(member, one, A, B);
2136 C.assign_data( &AA(i+2,0,0,v) );
2137 KB::Trsm<member_type,
2138 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2139 default_mode_type,default_algo_type>
2140 ::invoke(member, one, A, C);
2141 A.assign_data( &AA(i+3,0,0,v) );
2142
2143 member.team_barrier();
2144 KB::Gemm<member_type,
2145 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2146 default_mode_type,default_algo_type>
2147 ::invoke(member, -one, C, B, one, A);
2148 KB::LU<member_type,
2149 default_mode_type,KB::Algo::LU::Unblocked>
2150 ::invoke(member, A, tiny);
2151 }
2152 } else {
2153 // for block jacobi invert a matrix here
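// (copy the LU factors into W, set A := I, then apply U^{-1} L^{-1} so that
//  A ends up holding the explicit inverse used by the single-row solve)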
2154 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2155 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2156 ::invoke(member, A, W);
2157 KB::SetIdentity<member_type,default_mode_type>
2158 ::invoke(member, A);
2159 member.team_barrier();
2160 KB::Trsm<member_type,
2161 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2162 default_mode_type,default_algo_type>
2163 ::invoke(member, one, W, A);
2164 KB::Trsm<member_type,
2165 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2166 default_mode_type,default_algo_type>
2167 ::invoke(member, one, W, A);
2168 }
2169 }
2170
2171 public:
2172
2173 struct ExtractAndFactorizeTag {};
2174
2175 KOKKOS_INLINE_FUNCTION
2176 void
2177 operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
2178 // btdm is packed, with parts sorted from largest to smallest
2179 const local_ordinal_type packidx = member.league_rank();
2180
2181 const local_ordinal_type partidx = packptr(packidx);
2182 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2183 const local_ordinal_type i0 = pack_td_ptr(partidx);
2184 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2185
2186 internal_vector_scratch_type_3d_view
2187 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
2188 if (vector_loop_size == 1) {
2189 extract(partidx, npacks);
2190 factorize(member, i0, nrows, 0, internal_vector_values, WW);
2191 } else {
2192 Kokkos::parallel_for
2193 (Kokkos::ThreadVectorRange(member, vector_loop_size),
2194 [&](const local_ordinal_type &v) {
2195 const local_ordinal_type vbeg = v*internal_vector_length;
2196 if (vbeg < npacks)
2197 extract(member, partidx+vbeg, npacks, vbeg);
2198 // this is not safe if the vector loop size differs from the vector size of
2199 // the team policy; we always ensure they match when constructing the team policy
2200 member.team_barrier();
2201 factorize(member, i0, nrows, v, internal_vector_values, WW);
2202 });
2203 }
2204 }
2205
2206 void run() {
2207 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2208 const local_ordinal_type team_size =
2209 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2210 recommended_team_size(blocksize, vector_length, internal_vector_length);
2211 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
2212 shmem_size(blocksize, blocksize, vector_loop_size);
2213
2214 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
2215 policy(packptr.extent(0)-1, team_size, vector_loop_size);
2216#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2217 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2218 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
2219#else
2220 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
2221 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
2222 policy, *this);
2223#endif
2224 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2225 }
2226
2227 };
2228
2232 template<typename MatrixType>
2233 void
2234 performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
2235 const PartInterface<MatrixType> &interf,
2236 BlockTridiags<MatrixType> &btdm,
2237 const typename ImplType<MatrixType>::magnitude_type tiny) {
2238 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
2239 ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
2240 function.run();
2241 }
2242
2246 template<typename MatrixType>
2247 struct MultiVectorConverter {
2248 public:
2249 using impl_type = ImplType<MatrixType>;
2250 using execution_space = typename impl_type::execution_space;
2251 using memory_space = typename impl_type::memory_space;
2252
2253 using local_ordinal_type = typename impl_type::local_ordinal_type;
2254 using impl_scalar_type = typename impl_type::impl_scalar_type;
2255 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2256 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
2257 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2258 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2259 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2260 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
2261 static constexpr int vector_length = impl_type::vector_length;
2262
2263 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2264
2265 private:
2266 // part interface
2267 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2268 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2269 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2270 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2271 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2272 const local_ordinal_type blocksize;
2273 const local_ordinal_type num_vectors;
2274
2275 // packed multivector output (or input)
2276 vector_type_3d_view packed_multivector;
2277 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
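// repacking gathers the flat Tpetra layout scalar_multivector(blocksize*lclrow(.)+i, col)
// into the packed SIMD layout packed_multivector(packed row, i, col)[simd lane]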
2278
2279 template<typename TagType>
2280 KOKKOS_INLINE_FUNCTION
2281 void copy_multivectors(const local_ordinal_type &j,
2282 const local_ordinal_type &vi,
2283 const local_ordinal_type &pri,
2284 const local_ordinal_type &ri0) const {
2285 for (local_ordinal_type col=0;col<num_vectors;++col)
2286 for (local_ordinal_type i=0;i<blocksize;++i)
2287 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2288 }
2289
2290 public:
2291
2292 MultiVectorConverter(const PartInterface<MatrixType> &interf,
2293 const vector_type_3d_view &pmv)
2294 : partptr(interf.partptr),
2295 packptr(interf.packptr),
2296 part2packrowidx0(interf.part2packrowidx0),
2297 part2rowidx0(interf.part2rowidx0),
2298 lclrow(interf.lclrow),
2299 blocksize(pmv.extent(1)),
2300 num_vectors(pmv.extent(2)),
2301 packed_multivector(pmv) {}
2302
2303 // TODO: modify this routine similarly to the team-level functions
2304 // inline ---> FIXME HIP: should not need the KOKKOS_INLINE_FUNCTION below...
2305 KOKKOS_INLINE_FUNCTION
2306 void
2307 operator() (const local_ordinal_type &packidx) const {
2308 local_ordinal_type partidx = packptr(packidx);
2309 local_ordinal_type npacks = packptr(packidx+1) - partidx;
2310 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2311
2312 local_ordinal_type ri0[vector_length] = {};
2313 local_ordinal_type nrows[vector_length] = {};
2314 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
2315 ri0[v] = part2rowidx0(partidx);
2316 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
2317 }
2318 for (local_ordinal_type j=0;j<nrows[0];++j) {
2319 local_ordinal_type cnt = 1;
2320 for (;cnt<npacks && j!= nrows[cnt];++cnt);
2321 npacks = cnt;
2322 const local_ordinal_type pri = pri0 + j;
2323 for (local_ordinal_type col=0;col<num_vectors;++col)
2324 for (local_ordinal_type i=0;i<blocksize;++i)
2325 for (local_ordinal_type v=0;v<npacks;++v)
2326 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
2327 }
2328 }
2329
2330 KOKKOS_INLINE_FUNCTION
2331 void
2332 operator() (const member_type &member) const {
2333 const local_ordinal_type packidx = member.league_rank();
2334 const local_ordinal_type partidx_begin = packptr(packidx);
2335 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
2336 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
2337 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
2338 const local_ordinal_type partidx = partidx_begin + v;
2339 const local_ordinal_type ri0 = part2rowidx0(partidx);
2340 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
2341
2342 if (nrows == 1) {
2343 const local_ordinal_type pri = pri0;
2344 for (local_ordinal_type col=0;col<num_vectors;++col) {
2345 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
2346 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
2347 });
2348 }
2349 } else {
2350 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
2351 const local_ordinal_type pri = pri0 + j;
2352 for (local_ordinal_type col=0;col<num_vectors;++col)
2353 for (local_ordinal_type i=0;i<blocksize;++i)
2354 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
2355 });
2356 }
2357 });
2358 }
2359
2360 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
2361 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2362 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
2363
2364 scalar_multivector = scalar_multivector_;
2366#if defined(KOKKOS_ENABLE_CUDA)
2367 const local_ordinal_type vl = vector_length;
2368 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2369 Kokkos::parallel_for
2370 ("MultiVectorConverter::TeamPolicy", policy, *this);
2371#endif
2373#if defined(KOKKOS_ENABLE_HIP)
2374 const local_ordinal_type vl = vector_length;
2375 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
2376 Kokkos::parallel_for
2377 ("MultiVectorConverter::TeamPolicy", policy, *this);
2378#endif
2379 } else {
2380#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2381 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
2382#else
2383 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
2384 Kokkos::parallel_for
2385 ("MultiVectorConverter::RangePolicy", policy, *this);
2386#endif
2387 }
2388 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2389 }
2390 };
2391
2395 template<typename ArgActiveExecutionMemorySpace>
2396 struct SolveTridiagsDefaultModeAndAlgo;
2397 
2398 template<>
2399 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2400 typedef KB::Mode::Serial mode_type;
2401 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2402#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2403 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
2404#else
2405 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
2406#endif
2407 static int recommended_team_size(const int /* blksize */,
2408 const int /* vector_length */,
2409 const int /* internal_vector_length */) {
2410 return 1;
2411 }
2412 };
2413
2414#if defined(KOKKOS_ENABLE_CUDA)
2415 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2416 const int vector_length,
2417 const int internal_vector_length) {
2418 const int vector_size = vector_length/internal_vector_length;
2419 int total_team_size(0);
2420 if (blksize <= 5) total_team_size = 32;
2421 else if (blksize <= 9) total_team_size = 32; // 64
2422 else if (blksize <= 12) total_team_size = 96;
2423 else if (blksize <= 16) total_team_size = 128;
2424 else if (blksize <= 20) total_team_size = 160;
2425 else total_team_size = 160;
2426 return total_team_size/vector_size;
2427 }
2428
2429 template<>
2430 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2431 typedef KB::Mode::Team mode_type;
2432 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2433 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2434 static int recommended_team_size(const int blksize,
2435 const int vector_length,
2436 const int internal_vector_length) {
2437 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2438 }
2439 };
2440 template<>
2441 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2442 typedef KB::Mode::Team mode_type;
2443 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2444 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2445 static int recommended_team_size(const int blksize,
2446 const int vector_length,
2447 const int internal_vector_length) {
2448 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2449 }
2450 };
2451#endif
2452
2453#if defined(KOKKOS_ENABLE_HIP)
2454 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
2455 const int vector_length,
2456 const int internal_vector_length) {
2457 const int vector_size = vector_length/internal_vector_length;
2458 int total_team_size(0);
2459 if (blksize <= 5) total_team_size = 32;
2460 else if (blksize <= 9) total_team_size = 32; // 64
2461 else if (blksize <= 12) total_team_size = 96;
2462 else if (blksize <= 16) total_team_size = 128;
2463 else if (blksize <= 20) total_team_size = 160;
2464 else total_team_size = 160;
2465 return total_team_size/vector_size;
2466 }
2467
2468 template<>
2469 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPSpace> {
2470 typedef KB::Mode::Team mode_type;
2471 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2472 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2473 static int recommended_team_size(const int blksize,
2474 const int vector_length,
2475 const int internal_vector_length) {
2476 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2477 }
2478 };
2479 template<>
2480 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::HIPHostPinnedSpace> {
2481 typedef KB::Mode::Team mode_type;
2482 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
2483 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
2484 static int recommended_team_size(const int blksize,
2485 const int vector_length,
2486 const int internal_vector_length) {
2487 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2488 }
2489 };
2490#endif
2491
2492 template<typename MatrixType>
2493 struct SolveTridiags {
2494 public:
2495 using impl_type = ImplType<MatrixType>;
2496 using execution_space = typename impl_type::execution_space;
2497
2498 using local_ordinal_type = typename impl_type::local_ordinal_type;
2499 using size_type = typename impl_type::size_type;
2500 using impl_scalar_type = typename impl_type::impl_scalar_type;
2501 using magnitude_type = typename impl_type::magnitude_type;
2502 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2503 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2505 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2506 using size_type_1d_view = typename impl_type::size_type_1d_view;
2508 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2509 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2510 //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2511
2512 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2513
2514 using internal_vector_type = typename impl_type::internal_vector_type;
2515 static constexpr int vector_length = impl_type::vector_length;
2516 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2517
2519 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2520 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
2521
2523 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2524 using member_type = typename team_policy_type::member_type;
2525
2526 private:
2527 // part interface
2528 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2529 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2530 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2531 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2532
2533 // block tridiags
2534 const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2535
2536 // block tridiags values
2537 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2538 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2539
2540 const local_ordinal_type vector_loop_size;
2541
2542 // copy to multivectors : damping factor and Y_scalar_multivector
2543 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
2544#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
2545 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2546#else
2547 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2548#endif
2549 const impl_scalar_type df;
2550 const bool compute_diff;
2551
2552 public:
2553 SolveTridiags(const PartInterface<MatrixType> &interf,
2554 const BlockTridiags<MatrixType> &btdm,
2555 const vector_type_3d_view &pmv,
2556 const impl_scalar_type damping_factor,
2557 const bool is_norm_manager_active)
2558 :
2559 // interface
2560 partptr(interf.partptr),
2561 packptr(interf.packptr),
2562 part2packrowidx0(interf.part2packrowidx0),
2563 lclrow(interf.lclrow),
2564 // block tridiags and multivector
2565 pack_td_ptr(btdm.pack_td_ptr),
2566 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2567 btdm.values.extent(0),
2568 btdm.values.extent(1),
2569 btdm.values.extent(2),
2570 vector_length/internal_vector_length),
2571 X_internal_vector_values((internal_vector_type*)pmv.data(),
2572 pmv.extent(0),
2573 pmv.extent(1),
2574 pmv.extent(2),
2575 vector_length/internal_vector_length),
2576 vector_loop_size(vector_length/internal_vector_length),
2577 Y_scalar_multivector(),
2578 Z_scalar_vector(),
2579 df(damping_factor),
2580 compute_diff(is_norm_manager_active)
2581 {}
2582
2583 public:
2584
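// Unpacks the SIMD-packed solution back into the flat Tpetra multivector,
// applying y += df*(x - y) per entry, and accumulates the team's partial
// sum of |x - y|^2 into Z_scalar_vector for the norm manager.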
2586 KOKKOS_INLINE_FUNCTION
2587 void
2588 copyToFlatMultiVector(const member_type &member,
2589 const local_ordinal_type partidxbeg, // partidx for v = 0
2590 const local_ordinal_type npacks,
2591 const local_ordinal_type pri0,
2592 const local_ordinal_type v, // index with a loop of vector_loop_size
2593 const local_ordinal_type blocksize,
2594 const local_ordinal_type num_vectors) const {
2595 const local_ordinal_type vbeg = v*internal_vector_length;
2596 if (vbeg < npacks) {
2597 local_ordinal_type ri0_vals[internal_vector_length] = {};
2598 local_ordinal_type nrows_vals[internal_vector_length] = {};
2599 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2600 const local_ordinal_type partidx = partidxbeg+vv;
2601 ri0_vals[vi] = partptr(partidx);
2602 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2603 }
2604
2605 impl_scalar_type z_partial_sum(0);
2606 if (nrows_vals[0] == 1) {
2607 const local_ordinal_type j=0, pri=pri0;
2608 {
2609 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2610 const local_ordinal_type ri0 = ri0_vals[vi];
2611 const local_ordinal_type nrows = nrows_vals[vi];
2612 if (j < nrows) {
2613 Kokkos::parallel_for
2614 (Kokkos::TeamThreadRange(member, blocksize),
2615 [&](const local_ordinal_type &i) {
2616 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2617 for (local_ordinal_type col=0;col<num_vectors;++col) {
2618 impl_scalar_type &y = Y_scalar_multivector(row,col);
2619 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2620 y += df*yd;
2621
2622 {//if (compute_diff) {
2623 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2624 z_partial_sum += yd_abs*yd_abs;
2625 }
2626 }
2627 });
2628 }
2629 }
2630 }
2631 } else {
2632 Kokkos::parallel_for
2633 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2634 [&](const local_ordinal_type &j) {
2635 const local_ordinal_type pri = pri0 + j;
2636 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2637 const local_ordinal_type ri0 = ri0_vals[vi];
2638 const local_ordinal_type nrows = nrows_vals[vi];
2639 if (j < nrows) {
2640 for (local_ordinal_type col=0;col<num_vectors;++col) {
2641 for (local_ordinal_type i=0;i<blocksize;++i) {
2642 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2643 impl_scalar_type &y = Y_scalar_multivector(row,col);
2644 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2645 y += df*yd;
2646
2647 {//if (compute_diff) {
2648 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2649 z_partial_sum += yd_abs*yd_abs;
2650 }
2651 }
2652 }
2653 }
2654 }
2655 });
2656 }
2657 //if (compute_diff)
2658 Z_scalar_vector(member.league_rank()) += z_partial_sum;
2659 }
2660 }
2661
2665 template<typename WWViewType>
2666 KOKKOS_INLINE_FUNCTION
2667 void
2668 solveSingleVector(const member_type &member,
2669 const local_ordinal_type &blocksize,
2670 const local_ordinal_type &i0,
2671 const local_ordinal_type &r0,
2672 const local_ordinal_type &nrows,
2673 const local_ordinal_type &v,
2674 const WWViewType &WW) const {
2675
2676 typedef SolveTridiagsDefaultModeAndAlgo
2677 <typename execution_space::memory_space> default_mode_and_algo_type;
2678
2679 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2680 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2681
2682 // base pointers
2683 auto A = D_internal_vector_values.data();
2684 auto X = X_internal_vector_values.data();
2685
2686 // constant
2687 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2688 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2689 //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2690
2691 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2692 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2693 const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2694 const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2695 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2696 const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2697
2698 // for multiple rhs
2699 //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2700 //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2701
2702 // move to starting point
2703 A += i0*astep + v;
2704 X += r0*xstep + v;
2705
2706 //for (local_ordinal_type col=0;col<num_vectors;++col)
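// Single-vector block forward/backward substitution on the packed LU factors
// (a sketch of the recurrences implemented below, with hypothetical names S_k
// and T_k for the stored sub- and super-diagonal coupling blocks):
//   forward:  x_0 := L_0^{-1} x_0;             x_k := L_k^{-1} (x_k - S_k x_{k-1})
//   backward: x_{n-1} := U_{n-1}^{-1} x_{n-1}; x_k := U_k^{-1} (x_k - T_k x_{k+1})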
2707 if (nrows > 1) {
2708 // solve Lx = x
2709 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2710 (default_mode_type,default_algo_type,
2711 member,
2712 KB::Diag::Unit,
2713 blocksize,blocksize,
2714 one,
2715 A, as0, as1,
2716 X, xs0);
2717
2718 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2719 member.team_barrier();
2720 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2721 (default_mode_type,default_algo_type,
2722 member,
2723 blocksize, blocksize,
2724 -one,
2725 A+2*astep, as0, as1,
2726 X, xs0,
2727 one,
2728 X+1*xstep, xs0);
2729 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2730 (default_mode_type,default_algo_type,
2731 member,
2732 KB::Diag::Unit,
2733 blocksize,blocksize,
2734 one,
2735 A+3*astep, as0, as1,
2736 X+1*xstep, xs0);
2737
2738 A += 3*astep;
2739 X += 1*xstep;
2740 }
2741
2742 // solve Ux = x
2743 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2744 (default_mode_type,default_algo_type,
2745 member,
2746 KB::Diag::NonUnit,
2747 blocksize, blocksize,
2748 one,
2749 A, as0, as1,
2750 X, xs0);
2751
2752 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2753 A -= 3*astep;
2754 member.team_barrier();
2755 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2756 (default_mode_type,default_algo_type,
2757 member,
2758 blocksize, blocksize,
2759 -one,
2760 A+1*astep, as0, as1,
2761 X, xs0,
2762 one,
2763 X-1*xstep, xs0);
2764 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2765 (default_mode_type,default_algo_type,
2766 member,
2767 KB::Diag::NonUnit,
2768 blocksize, blocksize,
2769 one,
2770 A, as0, as1,
2771 X-1*xstep,xs0);
2772 X -= 1*xstep;
2773 }
2774 // for multiple rhs
2775 //X += xs1;
2776 } else {
2777 const local_ordinal_type ws0 = WW.stride_0();
2778 auto W = WW.data() + v;
2779 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2780 (default_mode_type,
2781 member, blocksize, X, xs0, W, ws0);
2782 member.team_barrier();
2783 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2784 (default_mode_type,default_algo_type,
2785 member,
2786 blocksize, blocksize,
2787 one,
2788 A, as0, as1,
2789 W, xs0,
2790 zero,
2791 X, xs0);
2792 }
2793 }
2794
2795 template<typename WWViewType>
2796 KOKKOS_INLINE_FUNCTION
2797 void
2798 solveMultiVector(const member_type &member,
2799 const local_ordinal_type &/* blocksize */,
2800 const local_ordinal_type &i0,
2801 const local_ordinal_type &r0,
2802 const local_ordinal_type &nrows,
2803 const local_ordinal_type &v,
2804 const WWViewType &WW) const {
2805
2806 typedef SolveTridiagsDefaultModeAndAlgo
2807 <typename execution_space::memory_space> default_mode_and_algo_type;
2808
2809 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2810 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2811
2812 // constant
2813 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2814 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2815
2816 // subview pattern
2817 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2818 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2819 auto X2 = X1;
2820
2821 local_ordinal_type i = i0, r = r0;
2822
2823
2824 if (nrows > 1) {
2825 // solve Lx = x
2826 KB::Trsm<member_type,
2827 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2828 default_mode_type,default_algo_type>
2829 ::invoke(member, one, A, X1);
2830 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2831 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2832 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2833 member.team_barrier();
2834 KB::Gemm<member_type,
2835 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2836 default_mode_type,default_algo_type>
2837 ::invoke(member, -one, A, X1, one, X2);
2838 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2839 KB::Trsm<member_type,
2840 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2841 default_mode_type,default_algo_type>
2842 ::invoke(member, one, A, X2);
2843 X1.assign_data( X2.data() );
2844 }
2845
2846 // solve Ux = x
2847 KB::Trsm<member_type,
2848 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2849 default_mode_type,default_algo_type>
2850 ::invoke(member, one, A, X1);
2851 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2852 i -= 3;
2853 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2854 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2855 member.team_barrier();
2856 KB::Gemm<member_type,
2857 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2858 default_mode_type,default_algo_type>
2859 ::invoke(member, -one, A, X1, one, X2);
2860
2861 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2862 KB::Trsm<member_type,
2863 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2864 default_mode_type,default_algo_type>
2865 ::invoke(member, one, A, X2);
2866 X1.assign_data( X2.data() );
2867 }
2868 } else {
2869 // matrix is already inverted
2870 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2871 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2872 ::invoke(member, X1, W);
2873 member.team_barrier();
2874 KB::Gemm<member_type,
2875 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2876 default_mode_type,default_algo_type>
2877 ::invoke(member, one, A, W, zero, X1);
2878 }
2879 }
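    // solveMultiVector performs the same forward/backward substitution as
    // solveSingleVector, but on all right-hand-side columns at once: the
    // triangular solves become Trsm and the off-diagonal updates become Gemm
    // on the (blocksize x num_vectors) panels X1/X2. The nrows == 1 case again
    // multiplies by the pre-inverted diagonal block through the scratch panel W.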
2880
2881 template<int B> struct SingleVectorTag {};
2882 template<int B> struct MultiVectorTag {};
2883
2884 template<int B>
2885 KOKKOS_INLINE_FUNCTION
2886 void
2887 operator() (const SingleVectorTag<B> &, const member_type &member) const {
2888 const local_ordinal_type packidx = member.league_rank();
2889 const local_ordinal_type partidx = packptr(packidx);
2890 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2891 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2892 const local_ordinal_type i0 = pack_td_ptr(partidx);
2893 const local_ordinal_type r0 = part2packrowidx0(partidx);
2894 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2895 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2896 const local_ordinal_type num_vectors = 1;
2897 internal_vector_scratch_type_3d_view
2898 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2899 Kokkos::single(Kokkos::PerTeam(member), [&]() {
2900 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2901 });
2902 Kokkos::parallel_for
2903 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2904 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2905 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2906 });
2907 }
2908
2909 template<int B>
2910 KOKKOS_INLINE_FUNCTION
2911 void
2912 operator() (const MultiVectorTag<B> &, const member_type &member) const {
2913 const local_ordinal_type packidx = member.league_rank();
2914 const local_ordinal_type partidx = packptr(packidx);
2915 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2916 const local_ordinal_type pri0 = part2packrowidx0(partidx);
2917 const local_ordinal_type i0 = pack_td_ptr(partidx);
2918 const local_ordinal_type r0 = part2packrowidx0(partidx);
2919 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2920 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2921 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2922
2923 internal_vector_scratch_type_3d_view
2924 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2925 Kokkos::single(Kokkos::PerTeam(member), [&]() {
2926 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2927 });
2928 Kokkos::parallel_for
2929 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2930 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2931 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2932 });
2933 }
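    // Dispatch pattern of the tagged operators above: each team handles one
    // pack of parts. packptr gives the first part of the pack, partptr its row
    // range, pack_td_ptr the offset of its tridiagonal blocks, and
    // part2packrowidx0 the offset of its packed rows. The ThreadVectorRange
    // loop over vector_loop_size walks the internal vector (SIMD) lanes, so
    // each lane v works on its own slice of the pack and then scatters its
    // rows back to the flat multivector via copyToFlatMultiVector.
    // Z_scalar_vector is reset per team before the solve.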
2934
2935 void run(const impl_scalar_type_2d_view_tpetra &Y,
2936 const impl_scalar_type_1d_view &Z) {
2937 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
2938 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
2939
2941 this->Y_scalar_multivector = Y;
2942 this->Z_scalar_vector = Z;
2943
2944 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2945 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2946
2947 const local_ordinal_type team_size =
2948 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2949 recommended_team_size(blocksize, vector_length, internal_vector_length);
2950 const int per_team_scratch = internal_vector_scratch_type_3d_view
2951 ::shmem_size(blocksize, num_vectors, vector_loop_size);
2952
2953#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2954#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2955 if (num_vectors == 1) { \
2956 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2957 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2958 Kokkos::parallel_for \
2959 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2960 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2961 } else { \
2962 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2963 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2964 Kokkos::parallel_for \
2965 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2966 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2967 } break
2968#else
2969#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2970 if (num_vectors == 1) { \
2971 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2972 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2973 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2974 Kokkos::parallel_for \
2975 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2976 policy, *this); \
2977 } else { \
2978 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2979 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2980 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2981 Kokkos::parallel_for \
2982 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2983 policy, *this); \
2984 } break
2985#endif
2986 switch (blocksize) {
2987 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2988 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2989 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2990 case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2991 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2992 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2993 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2994 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2995 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2996 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2997 }
2998#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
2999
3000 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3001 }
3002 };
3003
3007 static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
3008 const int team_size) {
3009 int total_team_size(0);
3010 if (blksize <= 5) total_team_size = 32;
3011 else if (blksize <= 9) total_team_size = 32; // 64
3012 else if (blksize <= 12) total_team_size = 96;
3013 else if (blksize <= 16) total_team_size = 128;
3014 else if (blksize <= 20) total_team_size = 160;
3015 else total_team_size = 160;
3016 return total_team_size/team_size;
3017 }
3018
3019 static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
3020 const int team_size) {
3021 int total_team_size(0);
3022 if (blksize <= 5) total_team_size = 32;
3023 else if (blksize <= 9) total_team_size = 32; // 64
3024 else if (blksize <= 12) total_team_size = 96;
3025 else if (blksize <= 16) total_team_size = 128;
3026 else if (blksize <= 20) total_team_size = 160;
3027 else total_team_size = 160;
3028 return total_team_size/team_size;
3029 }
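  // Example: with blksize = 7 and team_size = 8, both helpers above select a
  // total team size of 32, so the recommended vector size is 32/8 = 4; a
  // larger block such as blksize = 16 selects 128 threads, giving 128/8 = 16.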
3030
3031 template<typename MatrixType>
3032 struct ComputeResidualVector {
3033 public:
3034 using impl_type = ImplType<MatrixType>;
3035 using node_device_type = typename impl_type::node_device_type;
3036 using execution_space = typename impl_type::execution_space;
3037 using memory_space = typename impl_type::memory_space;
3038
3039 using local_ordinal_type = typename impl_type::local_ordinal_type;
3040 using size_type = typename impl_type::size_type;
3041 using impl_scalar_type = typename impl_type::impl_scalar_type;
3042 using magnitude_type = typename impl_type::magnitude_type;
3043 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3044 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3046 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3047 using size_type_1d_view = typename impl_type::size_type_1d_view;
3048 using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
3049 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3050 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
3051 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3052 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3053 static constexpr int vector_length = impl_type::vector_length;
3054
3056 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3057
3058 // max blocksize as an enum constant
3059 enum : int { max_blocksize = 32 };
3060
3061 private:
3062 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
3063 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
3064 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
3065 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
3066 Unmanaged<vector_type_3d_view> y_packed;
3067 Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
3068
3069 // AmD information
3070 const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
3071 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
3072 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
3073
3074 // block crs graph information
3075 // for cuda (the kokkos crs graph uses a size_type different from size_t)
3076 const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
3077 const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
3078
3079 // blocksize
3080 const local_ordinal_type blocksize_requested;
3081
3082 // part interface
3083 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3084 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3085 const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
3086 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3087 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3088 const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
3089 const bool is_dm2cm_active;
3090
3091 public:
3092 template<typename LocalCrsGraphType>
3093 ComputeResidualVector(const AmD<MatrixType> &amd,
3094 const LocalCrsGraphType &graph,
3095 const local_ordinal_type &blocksize_requested_,
3096 const PartInterface<MatrixType> &interf,
3097 const local_ordinal_type_1d_view &dm2cm_)
3098 : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
3099 colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
3100 tpetra_values(amd.tpetra_values),
3101 A_rowptr(graph.row_map),
3102 A_colind(graph.entries),
3103 blocksize_requested(blocksize_requested_),
3104 part2packrowidx0(interf.part2packrowidx0),
3105 part2rowidx0(interf.part2rowidx0),
3106 rowidx2part(interf.rowidx2part),
3107 partptr(interf.partptr),
3108 lclrow(interf.lclrow),
3109 dm2cm(dm2cm_),
3110 is_dm2cm_active(dm2cm_.span() > 0)
3111 {}
3112
3113 inline
3114 void
3115 SerialGemv(const local_ordinal_type &blocksize,
3116 const impl_scalar_type * const KOKKOS_RESTRICT AA,
3117 const impl_scalar_type * const KOKKOS_RESTRICT xx,
3118 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
3119 using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3120 for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3121 impl_scalar_type val = 0;
3122#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
3123# pragma ivdep
3124#endif
3125#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
3126# pragma unroll
3127#endif
3128 for (local_ordinal_type k1=0;k1<blocksize;++k1)
3129 val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
3130 yy[k0] -= val;
3131 }
3132 }
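    // SerialGemv computes yy -= AA * xx for a single block: AA is the flat
    // blocksize x blocksize block, addressed through tlb::getFlatIndex so the
    // loop follows the BlockCrsMatrix little-block array layout.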
3133
3134 template<typename bbViewType, typename yyViewType>
3135 KOKKOS_INLINE_FUNCTION
3136 void
3137 VectorCopy(const member_type &member,
3138 const local_ordinal_type &blocksize,
3139 const bbViewType &bb,
3140 const yyViewType &yy) const {
3141 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
3142 yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
3143 });
3144 }
3145
3146 template<typename AAViewType, typename xxViewType, typename yyViewType>
3147 KOKKOS_INLINE_FUNCTION
3148 void
3149 TeamVectorGemv(const member_type &member,
3150 const local_ordinal_type &blocksize,
3151 const AAViewType &AA,
3152 const xxViewType &xx,
3153 const yyViewType &yy) const {
3154 Kokkos::parallel_for
3155 (Kokkos::TeamThreadRange(member, blocksize),
3156 [&](const local_ordinal_type &k0) {
3157 impl_scalar_type val = 0;
3158 Kokkos::parallel_for
3159 (Kokkos::ThreadVectorRange(member, blocksize),
3160 [&](const local_ordinal_type &k1) {
3161 val += AA(k0,k1)*xx(k1);
3162 });
3163 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3164 });
3165 }
3166
3167 template<typename AAViewType, typename xxViewType, typename yyViewType>
3168 KOKKOS_INLINE_FUNCTION
3169 void
3170 VectorGemv(const member_type &member,
3171 const local_ordinal_type &blocksize,
3172 const AAViewType &AA,
3173 const xxViewType &xx,
3174 const yyViewType &yy) const {
3175 Kokkos::parallel_for
3176 (Kokkos::ThreadVectorRange(member, blocksize),
3177 [&](const local_ordinal_type &k0) {
3178 impl_scalar_type val(0);
3179 for (local_ordinal_type k1=0;k1<blocksize;++k1) {
3180 val += AA(k0,k1)*xx(k1);
3181 }
3182 Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
3183 });
3184 }
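    // The two device variants above differ only in how work is mapped to the
    // team: TeamVectorGemv spreads rows over TeamThreadRange and columns over
    // ThreadVectorRange, while VectorGemv spreads rows over ThreadVectorRange
    // with a serial inner loop. Both subtract the product atomically into yy.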
3185
3186 // template<typename AAViewType, typename xxViewType, typename yyViewType>
3187 // KOKKOS_INLINE_FUNCTION
3188 // void
3189 // VectorGemv(const member_type &member,
3190 // const local_ordinal_type &blocksize,
3191 // const AAViewType &AA,
3192 // const xxViewType &xx,
3193 // const yyViewType &yy) const {
3194 // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
3195 // impl_scalar_type val = 0;
3196 // Kokkos::parallel_for
3197 // (Kokkos::ThreadVectorRange(member, blocksize),
3198 // [&](const local_ordinal_type &k1) {
3199 // val += AA(k0,k1)*xx(k1);
3200 // });
3201 // Kokkos::atomic_fetch_add(&yy(k0), -val);
3202 // }
3203 // }
3204
3205 struct SeqTag {};
3206
3207 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3208 KOKKOS_INLINE_FUNCTION
3209 void
3210 operator() (const SeqTag &, const local_ordinal_type& i) const {
3211 const local_ordinal_type blocksize = blocksize_requested;
3212 const local_ordinal_type blocksize_square = blocksize*blocksize;
3213
3214 // constants
3215 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3216 const local_ordinal_type num_vectors = y.extent(1);
3217 const local_ordinal_type row = i*blocksize;
3218 for (local_ordinal_type col=0;col<num_vectors;++col) {
3219 // y := b
3220 impl_scalar_type *yy = &y(row, col);
3221 const impl_scalar_type * const bb = &b(row, col);
3222 memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
3223
3224 // y -= Rx
3225 const size_type A_k0 = A_rowptr[i];
3226 for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
3227 const size_type j = A_k0 + colindsub[k];
3228 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3229 const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
3230 SerialGemv(blocksize,AA,xx,yy);
3231 }
3232 }
3233 }
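    // SeqTag (range version): for each block row i, copy the right-hand side
    // into y and subtract R*x, where R is the off-block-diagonal part of A
    // selected by rowptr/colindsub (the "A minus D" structure). A_rowptr and
    // A_colind describe the underlying block CRS graph, and tpetra_values
    // stores the block entries contiguously, blocksize*blocksize per block.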
3234
3235 KOKKOS_INLINE_FUNCTION
3236 void
3237 operator() (const SeqTag &, const member_type &member) const {
3238
3239 // constants
3240 const local_ordinal_type blocksize = blocksize_requested;
3241 const local_ordinal_type blocksize_square = blocksize*blocksize;
3242
3243 const local_ordinal_type lr = member.league_rank();
3244 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3245 const local_ordinal_type num_vectors = y.extent(1);
3246
3247 // subview pattern
3248 auto bb = Kokkos::subview(b, block_range, 0);
3249 auto xx = bb;
3250 auto yy = Kokkos::subview(y, block_range, 0);
3251 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3252
3253 const local_ordinal_type row = lr*blocksize;
3254 for (local_ordinal_type col=0;col<num_vectors;++col) {
3255 // y := b
3256 yy.assign_data(&y(row, col));
3257 bb.assign_data(&b(row, col));
3258 if (member.team_rank() == 0)
3259 VectorCopy(member, blocksize, bb, yy);
3260 member.team_barrier();
3261
3262 // y -= Rx
3263 const size_type A_k0 = A_rowptr[lr];
3264 Kokkos::parallel_for
3265 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3266 [&](const local_ordinal_type &k) {
3267 const size_type j = A_k0 + colindsub[k];
3268 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3269 xx.assign_data( &x(A_colind[j]*blocksize, col) );
3270 VectorGemv(member, blocksize, A_block, xx, yy);
3271 });
3272 }
3273 }
3274
3275 template<int B>
3276 struct AsyncTag {};
3277
3278 template<int B>
3279 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3280 KOKKOS_INLINE_FUNCTION
3281 void
3282 operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
3283 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3284 const local_ordinal_type blocksize_square = blocksize*blocksize;
3285
3286 // constants
3287 const local_ordinal_type partidx = rowidx2part(rowidx);
3288 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3289 const local_ordinal_type v = partidx % vector_length;
3290
3291 const local_ordinal_type num_vectors = y_packed.extent(2);
3292 const local_ordinal_type num_local_rows = lclrow.extent(0);
3293
3294 // temporary buffer for y flat
3295 impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
3296
3297 const local_ordinal_type lr = lclrow(rowidx);
3298 const local_ordinal_type row = lr*blocksize;
3299 for (local_ordinal_type col=0;col<num_vectors;++col) {
3300 // y := b
3301 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3302
3303 // y -= Rx
3304 const size_type A_k0 = A_rowptr[lr];
3305 for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
3306 const size_type j = A_k0 + colindsub[k];
3307 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3308 const local_ordinal_type A_colind_at_j = A_colind[j];
3309 if (A_colind_at_j < num_local_rows) {
3310 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3311 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3312 SerialGemv(blocksize, AA,xx,yy);
3313 } else {
3314 const auto loc = A_colind_at_j - num_local_rows;
3315 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3316 SerialGemv(blocksize, AA,xx_remote,yy);
3317 }
3318 }
3319 // move yy to y_packed
3320 for (local_ordinal_type k=0;k<blocksize;++k)
3321 y_packed(pri, k, col)[v] = yy[k];
3322 }
3323 }
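    // AsyncTag: same residual as SeqTag, but x is split between owned and
    // remote (ghost) columns. Column indices below lclrow.extent(0) refer to
    // owned rows (optionally remapped through dm2cm); the rest index into
    // x_remote, filled by the asynchronous importer. The result is written to
    // the packed multivector y_packed at (pri, k, col) in vector lane v.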
3324
3325 template<int B>
3326 KOKKOS_INLINE_FUNCTION
3327 void
3328 operator() (const AsyncTag<B> &, const member_type &member) const {
3329 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3330 const local_ordinal_type blocksize_square = blocksize*blocksize;
3331
3332 // constants
3333 const local_ordinal_type rowidx = member.league_rank();
3334 const local_ordinal_type partidx = rowidx2part(rowidx);
3335 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3336 const local_ordinal_type v = partidx % vector_length;
3337
3338 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3339 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3340 const local_ordinal_type num_local_rows = lclrow.extent(0);
3341
3342 // subview pattern
3343 auto bb = Kokkos::subview(b, block_range, 0);
3344 auto xx = Kokkos::subview(x, block_range, 0);
3345 auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
3346 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3347 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3348
3349 const local_ordinal_type lr = lclrow(rowidx);
3350 const local_ordinal_type row = lr*blocksize;
3351 for (local_ordinal_type col=0;col<num_vectors;++col) {
3352 // y := b
3353 bb.assign_data(&b(row, col));
3354 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3355 if (member.team_rank() == 0)
3356 VectorCopy(member, blocksize, bb, yy);
3357 member.team_barrier();
3358
3359 // y -= Rx
3360 const size_type A_k0 = A_rowptr[lr];
3361 Kokkos::parallel_for
3362 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
3363 [&](const local_ordinal_type &k) {
3364 const size_type j = A_k0 + colindsub[k];
3365 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3366
3367 const local_ordinal_type A_colind_at_j = A_colind[j];
3368 if (A_colind_at_j < num_local_rows) {
3369 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3370 xx.assign_data( &x(loc*blocksize, col) );
3371 VectorGemv(member, blocksize, A_block, xx, yy);
3372 } else {
3373 const auto loc = A_colind_at_j - num_local_rows;
3374 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3375 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3376 }
3377 });
3378 }
3379 }
3380
3381 template <int P, int B> struct OverlapTag {};
3382
3383 template<int P, int B>
3384 // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
3385 KOKKOS_INLINE_FUNCTION
3386 void
3387 operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
3388 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3389 const local_ordinal_type blocksize_square = blocksize*blocksize;
3390
3391 // constants
3392 const local_ordinal_type partidx = rowidx2part(rowidx);
3393 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3394 const local_ordinal_type v = partidx % vector_length;
3395
3396 const local_ordinal_type num_vectors = y_packed.extent(2);
3397 const local_ordinal_type num_local_rows = lclrow.extent(0);
3398
3399 // temporary buffer for y flat
3400 impl_scalar_type yy[max_blocksize] = {};
3401
3402 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3403 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3404
3405 const local_ordinal_type lr = lclrow(rowidx);
3406 const local_ordinal_type row = lr*blocksize;
3407 for (local_ordinal_type col=0;col<num_vectors;++col) {
3408 if (P == 0) {
3409 // y := b
3410 memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
3411 } else {
3412 // y (temporary) := 0
3413 memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
3414 }
3415
3416 // y -= Rx
3417 const size_type A_k0 = A_rowptr[lr];
3418 for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
3419 const size_type j = A_k0 + colindsub_used[k];
3420 const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
3421 const local_ordinal_type A_colind_at_j = A_colind[j];
3422 if (P == 0) {
3423 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3424 const impl_scalar_type * const xx = &x(loc*blocksize, col);
3425 SerialGemv(blocksize,AA,xx,yy);
3426 } else {
3427 const auto loc = A_colind_at_j - num_local_rows;
3428 const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
3429 SerialGemv(blocksize,AA,xx_remote,yy);
3430 }
3431 }
3432 // move yy to y_packed
3433 if (P == 0) {
3434 for (local_ordinal_type k=0;k<blocksize;++k)
3435 y_packed(pri, k, col)[v] = yy[k];
3436 } else {
3437 for (local_ordinal_type k=0;k<blocksize;++k)
3438 y_packed(pri, k, col)[v] += yy[k];
3439 }
3440 }
3441 }
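    // OverlapTag<P,B>: phase P == 0 handles the owned columns and initializes
    // y := b; phase P == 1 runs after the halo exchange completes and only
    // accumulates (+=) the remote contributions into y_packed. Splitting the
    // phases lets the caller overlap communication with the owned-part work.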
3442
3443 template<int P, int B>
3444 KOKKOS_INLINE_FUNCTION
3445 void
3446 operator() (const OverlapTag<P,B> &, const member_type &member) const {
3447 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
3448 const local_ordinal_type blocksize_square = blocksize*blocksize;
3449
3450 // constants
3451 const local_ordinal_type rowidx = member.league_rank();
3452 const local_ordinal_type partidx = rowidx2part(rowidx);
3453 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
3454 const local_ordinal_type v = partidx % vector_length;
3455
3456 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
3457 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
3458 const local_ordinal_type num_local_rows = lclrow.extent(0);
3459
3460 // subview pattern
3461 auto bb = Kokkos::subview(b, block_range, 0);
3462 auto xx = bb; //Kokkos::subview(x, block_range, 0);
3463 auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3464 auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3465 auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3466 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3467 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3468
3469 const local_ordinal_type lr = lclrow(rowidx);
3470 const local_ordinal_type row = lr*blocksize;
3471 for (local_ordinal_type col=0;col<num_vectors;++col) {
3472 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3473 if (P == 0) {
3474 // y := b
3475 bb.assign_data(&b(row, col));
3476 if (member.team_rank() == 0)
3477 VectorCopy(member, blocksize, bb, yy);
3478 member.team_barrier();
3479 }
3480
3481 // y -= Rx
3482 const size_type A_k0 = A_rowptr[lr];
3483 Kokkos::parallel_for
3484 (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3485 [&](const local_ordinal_type &k) {
3486 const size_type j = A_k0 + colindsub_used[k];
3487 A_block.assign_data( &tpetra_values(j*blocksize_square) );
3488
3489 const local_ordinal_type A_colind_at_j = A_colind[j];
3490 if (P == 0) {
3491 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3492 xx.assign_data( &x(loc*blocksize, col) );
3493 VectorGemv(member, blocksize, A_block, xx, yy);
3494 } else {
3495 const auto loc = A_colind_at_j - num_local_rows;
3496 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3497 VectorGemv(member, blocksize, A_block, xx_remote, yy);
3498 }
3499 });
3500 }
3501 }
3502
3503 // y = b - Rx; seq method
3504 template<typename MultiVectorLocalViewTypeY,
3505 typename MultiVectorLocalViewTypeB,
3506 typename MultiVectorLocalViewTypeX>
3507 void run(const MultiVectorLocalViewTypeY &y_,
3508 const MultiVectorLocalViewTypeB &b_,
3509 const MultiVectorLocalViewTypeX &x_) {
3510 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3511 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
3512
3513 y = y_; b = b_; x = x_;
3514 if (is_cuda<execution_space>::value) {
3515#if defined(KOKKOS_ENABLE_CUDA)
3516 const local_ordinal_type blocksize = blocksize_requested;
3517 const local_ordinal_type team_size = 8;
3518 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3519 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3520 Kokkos::parallel_for
3521 ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3522#endif
3523 } else if(is_hip<execution_space>::value) {
3524#if defined(KOKKOS_ENABLE_HIP)
3525 const local_ordinal_type blocksize = blocksize_requested;
3526 const local_ordinal_type team_size = 8;
3527 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3528 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3529 Kokkos::parallel_for
3530 ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3531#endif
3532 } else {
3533#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3534 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3535#else
3536 const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3537 Kokkos::parallel_for
3538 ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3539#endif
3540 }
3541 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3542 }
3543
3544 // y = b - R (x , x_remote)
3545 template<typename MultiVectorLocalViewTypeB,
3546 typename MultiVectorLocalViewTypeX,
3547 typename MultiVectorLocalViewTypeX_Remote>
3548 void run(const vector_type_3d_view &y_packed_,
3549 const MultiVectorLocalViewTypeB &b_,
3550 const MultiVectorLocalViewTypeX &x_,
3551 const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3552 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3553 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
3554
3555 b = b_; x = x_; x_remote = x_remote_;
3556 if (is_cuda<execution_space>::value) {
3557#if defined(KOKKOS_ENABLE_CUDA)
3558 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3559 y_packed_.extent(0),
3560 y_packed_.extent(1),
3561 y_packed_.extent(2),
3562 vector_length);
3563#endif
3564 } else if (is_hip<execution_space>::value) {
3565#if defined(KOKKOS_ENABLE_HIP)
3566 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3567 y_packed_.extent(0),
3568 y_packed_.extent(1),
3569 y_packed_.extent(2),
3570 vector_length);
3571#endif
3572 } else {
3573 y_packed = y_packed_;
3574 }
3575
3576 if (is_cuda<execution_space>::value) {
3577#if defined(KOKKOS_ENABLE_CUDA)
3578 const local_ordinal_type blocksize = blocksize_requested;
3579 const local_ordinal_type team_size = 8;
3580 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3581 // local_ordinal_type vl_power_of_two = 1;
3582 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3583 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3584 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3585#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3586 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3587 policy(rowidx2part.extent(0), team_size, vector_size); \
3588 Kokkos::parallel_for \
3589 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3590 policy, *this); } break
3591 switch (blocksize_requested) {
3592 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3593 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3594 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3595 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3596 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3597 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3598 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3599 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3600 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3601 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3602 }
3603#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3604#endif
3605 } else if (is_hip<execution_space>::value) {
3606#if defined(KOKKOS_ENABLE_HIP)
3607 const local_ordinal_type blocksize = blocksize_requested;
3608 const local_ordinal_type team_size = 8;
3609 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3610 // local_ordinal_type vl_power_of_two = 1;
3611 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3612 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3613 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3614#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3615 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3616 policy(rowidx2part.extent(0), team_size, vector_size); \
3617 Kokkos::parallel_for \
3618 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3619 policy, *this); } break
3620 switch (blocksize_requested) {
3621 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3622 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3623 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3624 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3625 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3626 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3627 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3628 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3629 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3630 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3631 }
3632#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3633#endif
3634 } else {
3635#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3636 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3637#else
3638#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3639 const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3640 Kokkos::parallel_for \
3641 ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3642 policy, *this); } break
3643 switch (blocksize_requested) {
3644 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3645 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3646 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3647 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3648 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3649 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3650 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3651 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3652 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3653 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3654 }
3655#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3656#endif
3657 }
3658 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3659 }
3660
3661 // y = b - R (y , y_remote)
3662 template<typename MultiVectorLocalViewTypeB,
3663 typename MultiVectorLocalViewTypeX,
3664 typename MultiVectorLocalViewTypeX_Remote>
3665 void run(const vector_type_3d_view &y_packed_,
3666 const MultiVectorLocalViewTypeB &b_,
3667 const MultiVectorLocalViewTypeX &x_,
3668 const MultiVectorLocalViewTypeX_Remote &x_remote_,
3669 const bool compute_owned) {
3670 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3671 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
3672
3673 b = b_; x = x_; x_remote = x_remote_;
3674 if (is_cuda<execution_space>::value) {
3675#if defined(KOKKOS_ENABLE_CUDA)
3676 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3677 y_packed_.extent(0),
3678 y_packed_.extent(1),
3679 y_packed_.extent(2),
3680 vector_length);
3681#endif
3682 } else if (is_hip<execution_space>::value) {
3683#if defined(KOKKOS_ENABLE_HIP)
3684 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
3685 y_packed_.extent(0),
3686 y_packed_.extent(1),
3687 y_packed_.extent(2),
3688 vector_length);
3689#endif
3690 } else {
3691 y_packed = y_packed_;
3692 }
3693
3694 if (is_cuda<execution_space>::value) {
3695#if defined(KOKKOS_ENABLE_CUDA)
3696 const local_ordinal_type blocksize = blocksize_requested;
3697 const local_ordinal_type team_size = 8;
3698 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3699 // local_ordinal_type vl_power_of_two = 1;
3700 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3701 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3702 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3703#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3704 if (compute_owned) { \
3705 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3706 policy(rowidx2part.extent(0), team_size, vector_size); \
3707 Kokkos::parallel_for \
3708 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3709 } else { \
3710 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3711 policy(rowidx2part.extent(0), team_size, vector_size); \
3712 Kokkos::parallel_for \
3713 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3714 } break
3715 switch (blocksize_requested) {
3716 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3717 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3718 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3719 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3720 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3721 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3722 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3723 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3724 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3725 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3726 }
3727#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3728#endif
3729 } else if (is_hip<execution_space>::value) {
3730#if defined(KOKKOS_ENABLE_HIP)
3731 const local_ordinal_type blocksize = blocksize_requested;
3732 const local_ordinal_type team_size = 8;
3733 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedHIPVectorSize(blocksize, team_size);
3734 // local_ordinal_type vl_power_of_two = 1;
3735 // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3736 // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3737 // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3738#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3739 if (compute_owned) { \
3740 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3741 policy(rowidx2part.extent(0), team_size, vector_size); \
3742 Kokkos::parallel_for \
3743 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3744 } else { \
3745 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3746 policy(rowidx2part.extent(0), team_size, vector_size); \
3747 Kokkos::parallel_for \
3748 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3749 } break
3750 switch (blocksize_requested) {
3751 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3752 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3753 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3754 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3755 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3756 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3757 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3758 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3759 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3760 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3761 }
3762#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3763#endif
3764 } else {
3765#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
3766 TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: device compiler should not see this code");
3767#else
3768#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3769 if (compute_owned) { \
3770 const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3771 policy(0, rowidx2part.extent(0)); \
3772 Kokkos::parallel_for \
3773 ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3774 } else { \
3775 const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3776 policy(0, rowidx2part.extent(0)); \
3777 Kokkos::parallel_for \
3778 ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3779 } break
3780
3781 switch (blocksize_requested) {
3782 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3783 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3784 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3785 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3786 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3787 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3788 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3789 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3790 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3791 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3792 }
3793#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3794#endif
3795 }
3796 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3797 }
3798 };
3799
3800 template<typename MatrixType>
3801 void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3802 /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3803 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3804 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
3805
3806 using impl_type = ImplType<MatrixType>;
3807 using local_ordinal_type = typename impl_type::local_ordinal_type;
3808 using impl_scalar_type = typename impl_type::impl_scalar_type;
3809#if 0
3810 const auto norm2 = KokkosBlas::nrm1(zz);
3811#else
3812 impl_scalar_type norm2(0);
3813 Kokkos::parallel_reduce
3814 ("ReduceMultiVector::Device",
3815 Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3816 KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3817 update += zz(i);
3818 }, norm2);
3819#endif
3820 vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
3821
3822 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3823 }
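  // reduceVector sums the entries of zz into a single rank-local value and
  // reports its absolute value; in applyInverseJacobi below, zz is the W
  // workspace filled during SolveTridiags, so this value is presumably the
  // rank-local squared-norm contribution handed to the NormManager buffer.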
3824
3828 template<typename MatrixType>
3829 struct NormManager {
3830 public:
3831 using impl_type = ImplType<MatrixType>;
3832 using host_execution_space = typename impl_type::host_execution_space;
3833 using magnitude_type = typename impl_type::magnitude_type;
3834
3835 private:
3836 bool collective_;
3837 int sweep_step_, sweep_step_upper_bound_;
3838#ifdef HAVE_IFPACK2_MPI
3839 MPI_Request mpi_request_;
3840 MPI_Comm comm_;
3841#endif
3842 magnitude_type work_[3];
3843
3844 public:
3845 NormManager() = default;
3846 NormManager(const NormManager &b) = default;
3847 NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3848 sweep_step_ = 1;
3849 sweep_step_upper_bound_ = 1;
3850 collective_ = comm->getSize() > 1;
3851 if (collective_) {
3852#ifdef HAVE_IFPACK2_MPI
3853 const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3854 TEUCHOS_ASSERT( ! mpi_comm.is_null());
3855 comm_ = *mpi_comm->getRawMpiComm();
3856#endif
3857 }
3858 const magnitude_type zero(0), minus_one(-1);
3859 work_[0] = zero;
3860 work_[1] = zero;
3861 work_[2] = minus_one;
3862 }
3863
3864 // Check the norm every sweep_step sweeps.
3865 void setCheckFrequency(const int sweep_step) {
3866 TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3867 sweep_step_upper_bound_ = sweep_step;
3868 sweep_step_ = 1;
3869 }
3870
3871 // Get the buffer into which to store rank-local squared norms.
3872 magnitude_type* getBuffer() { return &work_[0]; }
3873
3874 // Call MPI_Iallreduce to find the global squared norms.
3875 void ireduce(const int sweep, const bool force = false) {
3876 if ( ! force && sweep % sweep_step_) return;
3877
3878 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3879
3880 work_[1] = work_[0];
3881#ifdef HAVE_IFPACK2_MPI
3882 auto send_data = &work_[1];
3883 auto recv_data = &work_[0];
3884 if (collective_) {
3885# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3886 MPI_Iallreduce(send_data, recv_data, 1,
3887 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3888 MPI_SUM, comm_, &mpi_request_);
3889# else
3890 MPI_Allreduce (send_data, recv_data, 1,
3891 Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3892 MPI_SUM, comm_);
3893# endif
3894 }
3895#endif
3896 }
3897
3898 // Check if the norm-based termination criterion is met. tol2 is the
3899 // tolerance squared. Sweep is the sweep index. If not every iteration is
3900 // being checked, this function immediately returns false. If a check must
3901 // be done at this iteration, it waits for the reduction triggered by
3902 // ireduce to complete, then checks the global norm against the tolerance.
3903 bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3904 // early return
3905 if (sweep <= 0) return false;
3906
3907 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3908
3909 TEUCHOS_ASSERT(sweep >= 1);
3910 if ( ! force && (sweep - 1) % sweep_step_) return false;
3911 if (collective_) {
3912#ifdef HAVE_IFPACK2_MPI
3913# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3914 MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3915# else
3916 // Do nothing.
3917# endif
3918#endif
3919 }
3920 bool r_val = false;
3921 if (sweep == 1) {
3922 work_[2] = work_[0];
3923 } else {
3924 r_val = (work_[0] < tol2*work_[2]);
3925 }
3926
3927 // adjust sweep step
3928 const auto adjusted_sweep_step = 2*sweep_step_;
3929 if (adjusted_sweep_step < sweep_step_upper_bound_) {
3930 sweep_step_ = adjusted_sweep_step;
3931 } else {
3932 sweep_step_ = sweep_step_upper_bound_;
3933 }
3934 return r_val;
3935 }
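    // Note that each performed check doubles sweep_step_ until it reaches the
    // bound given to setCheckFrequency, so early sweeps are checked often and
    // later sweeps only at the requested frequency.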
3936
3937 // After termination has occurred, finalize the norms for use in
3938 // get_norms{0,final}.
3939 void finalize () {
3940 work_[0] = std::sqrt(work_[0]); // after converged
3941 if (work_[2] >= 0)
3942 work_[2] = std::sqrt(work_[2]); // first norm
3943 // if work_[2] is minus one, then norm is not requested.
3944 }
3945
3946 // Report norms to the caller.
3947 const magnitude_type getNorms0 () const { return work_[2]; }
3948 const magnitude_type getNormsFinal () const { return work_[0]; }
3949 };
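  // Typical usage (see applyInverseJacobi below): the rank-local squared norm
  // is written into getBuffer(), ireduce(sweep) launches the global sum
  // (MPI_Iallreduce when available), checkDone(sweep, tol2) waits for it and
  // compares against tol2 times the first recorded squared norm, and
  // finalize() takes square roots so getNorms0()/getNormsFinal() report norms
  // rather than squared norms.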
3950
3954 template<typename MatrixType>
3955 int
3956 applyInverseJacobi(// importer
3957 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3958 const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3959 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3960 const bool overlap_communication_and_computation,
3961 // tpetra interface
3962 const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3963 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3964 /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3965 /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3966 // local object interface
3967 const PartInterface<MatrixType> &interf, // mesh interface
3968 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3969 const AmD<MatrixType> &amd, // R = A - D
3970 /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3971 /* */ NormManager<MatrixType> &norm_manager,
3972 // preconditioner parameters
3973 const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3974 /* */ bool is_y_zero,
3975 const int max_num_sweeps,
3976 const typename ImplType<MatrixType>::magnitude_type tol,
3977 const int check_tol_every) {
3978 IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3979
3980 using impl_type = ImplType<MatrixType>;
3981 using node_memory_space = typename impl_type::node_memory_space;
3982 using local_ordinal_type = typename impl_type::local_ordinal_type;
3983 using size_type = typename impl_type::size_type;
3984 using impl_scalar_type = typename impl_type::impl_scalar_type;
3985 using magnitude_type = typename impl_type::magnitude_type;
3986 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3987 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
3988 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3989 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3990
3991 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3992
3993 // the tpetra importer and the async importer must not both be active
3994 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
3995 "Neither Tpetra importer nor Async importer is null.");
3996 // the maximum number of sweeps must be a positive number
3997 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
3998 "Maximum number of sweeps must be >= 1.");
3999
4000 // const parameters
4001 const bool is_seq_method_requested = !tpetra_importer.is_null();
4002 const bool is_async_importer_active = !async_importer.is_null();
4003 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4004 const magnitude_type tolerance = tol*tol;
4005 const local_ordinal_type blocksize = btdm.values.extent(1);
4006 const local_ordinal_type num_vectors = Y.getNumVectors();
4007 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4008
4009 const impl_scalar_type zero(0.0);
4010
4011 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4012 "The seq method for applyInverseJacobi, " <<
4013 "which in any case is for developer use only, " <<
4014 "does not support norm-based termination.");
4015 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4016 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4017 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4018 std::invalid_argument,
4019 "The seq method for applyInverseJacobi, " <<
4020 "which in any case is for developer use only, " <<
4021 "only supports memory spaces accessible from host.");
4022
4023 // if more workspace is needed, resize it
4024 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4025 if (work.span() < work_span_required)
4026 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4027
4028 // construct W
4029 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4030 if (local_ordinal_type(W.extent(0)) < W_size)
4031 W = impl_scalar_type_1d_view("W", W_size);
4032
4033 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4034 {
4035 if (is_seq_method_requested) {
4036 if (Z.getNumVectors() != Y.getNumVectors())
4037 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4038 } else {
4039 if (is_async_importer_active) {
4040 // create comm data buffer and keep it here
4041 async_importer->createDataBuffer(num_vectors);
4042 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4043 }
4044 }
4045 }
4046
4047 // wrap the workspace with 3d view
4048 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4049 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4050 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4051 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4052 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4053
4054 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4055 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4056 damping_factor, is_norm_manager_active);
4057
4058 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4059 ComputeResidualVector<MatrixType>
4060 compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4061 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4062
4063 // set how frequently the norm manager checks the norm
4064 if (is_norm_manager_active)
4065 norm_manager.setCheckFrequency(check_tol_every);
4066
4067 // iterate
4068 int sweep = 0;
4069 for (;sweep<max_num_sweeps;++sweep) {
4070 {
4071 if (is_y_zero) {
4072 // pmv := x(lclrow)
4073 multivector_converter.run(XX);
4074 } else {
4075 if (is_seq_method_requested) {
4076 // SEQ METHOD IS TESTING ONLY
4077
4078 // y := x - R y
4079 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4080 compute_residual_vector.run(YY, XX, ZZ);
4081
4082 // pmv := y(lclrow).
4083 multivector_converter.run(YY);
4084 } else {
4085 // fused y := x - R y and pmv := y(lclrow);
4086 // the typical use case does not overlap computation and communication
4087 if (overlap_communication_and_computation || !is_async_importer_active) {
4088 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4089 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
4090 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4091 if (is_async_importer_active) async_importer->cancel();
4092 break;
4093 }
4094 if (is_async_importer_active) {
4095 async_importer->syncRecv();
4096 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
4097 }
4098 } else {
4099 if (is_async_importer_active)
4100 async_importer->syncExchange(YY);
4101 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
4102 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4103 }
4104 }
4105 }
4106 }
4107
4108 // pmv := inv(D) pmv.
4109 {
4110 solve_tridiags.run(YY, W);
4111 }
4112 {
4113 if (is_norm_manager_active) {
4114 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
4115 reduceVector<MatrixType>(W, norm_manager.getBuffer());
4116 if (sweep + 1 == max_num_sweeps) {
4117 norm_manager.ireduce(sweep, true);
4118 norm_manager.checkDone(sweep + 1, tolerance, true);
4119 } else {
4120 norm_manager.ireduce(sweep);
4121 }
4122 }
4123 }
4124 is_y_zero = false;
4125 }
4126
4127 // sqrt the norms for the caller's use.
4128 if (is_norm_manager_active) norm_manager.finalize();
4129
4130 return sweep;
4131 }
4132
4133
4134 template<typename MatrixType>
4135 struct ImplObject {
4136 using impl_type = ImplType<MatrixType>;
4137 using part_interface_type = PartInterface<MatrixType>;
4138 using block_tridiags_type = BlockTridiags<MatrixType>;
4139 using amd_type = AmD<MatrixType>;
4140 using norm_manager_type = NormManager<MatrixType>;
4141 using async_import_type = AsyncableImport<MatrixType>;
4142
4143 // distributed objects
4144 Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
4145 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
4146 Teuchos::RCP<async_import_type> async_importer;
4147 bool overlap_communication_and_computation;
4148
4149 // copy of Y (mutable to penetrate const)
4150 mutable typename impl_type::tpetra_multivector_type Z;
4151 mutable typename impl_type::impl_scalar_type_1d_view W;
4152
4153 // local objects
4154 part_interface_type part_interface;
4155 block_tridiags_type block_tridiags; // D
4156 amd_type a_minus_d; // R = A - D
4157 mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
4158 mutable norm_manager_type norm_manager;
4159 };
4160
4161 } // namespace BlockTriDiContainerDetails
4162
4163} // namespace Ifpack2
4164
4165#endif