Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45#include "Tpetra_RowMatrixTransposer.hpp"
46
47namespace Tpetra {
48
49namespace MatrixMatrix{
50
51namespace ExtraKernels{
52
53
54// Double product version
55template<class CrsMatrixType>
56size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
57 // Follows the NZ estimate in ML's ml_matmatmult.c
58 size_t Aest = 100, Best=100;
59 if (A.getLocalNumEntries() > 0)
60 Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
61 if (B.getLocalNumEntries() > 0)
62 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
63
64 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
65 nnzperrow *= nnzperrow;
66
67 return nnzperrow;
68}
69
70
71// Triple product version
72template<class CrsMatrixType>
73size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74 size_t nnzPerRowA = 100, Pcols = 100;
75 if (A.getLocalNumEntries() > 0)
76 nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
77 if (P.getLocalNumEntries() > 0)
78 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
79 return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
80}
81
82#if defined (HAVE_TPETRA_INST_OPENMP)
83/*********************************************************************************************************/
84template<class Scalar,
85 class LocalOrdinal,
86 class GlobalOrdinal,
87 class LocalOrdinalViewType>
88void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
90 const LocalOrdinalViewType & targetMapToOrigRow,
91 const LocalOrdinalViewType & targetMapToImportRow,
92 const LocalOrdinalViewType & Bcol2Ccol,
93 const LocalOrdinalViewType & Icol2Ccol,
94 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
95 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
96 const std::string& label,
97 const Teuchos::RCP<Teuchos::ParameterList>& params) {
98#ifdef HAVE_TPETRA_MMM_TIMINGS
99 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
100 using Teuchos::TimeMonitor;
101 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
102#endif
103 using Teuchos::Array;
104 using Teuchos::ArrayRCP;
105 using Teuchos::ArrayView;
106 using Teuchos::RCP;
107 using Teuchos::rcp;
108
109
110 // Lots and lots of typedefs
112 // typedef typename KCRS::device_type device_t;
113 typedef typename KCRS::StaticCrsGraphType graph_t;
114 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
115 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
116 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
117 typedef typename KCRS::values_type::non_const_type scalar_view_t;
118
119 // Unmanaged versions of the above
120 //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
121 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
122 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
123
124 typedef Scalar SC;
125 typedef LocalOrdinal LO;
126 typedef GlobalOrdinal GO;
127 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
128 typedef Map<LO,GO,NO> map_type;
129
130 // NOTE (mfh 15 Sep 2017) This is specifically only for
131 // execution_space = Kokkos::OpenMP, so we neither need nor want
132 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
133 typedef NO::execution_space execution_space;
134 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
135
136 // All of the invalid guys
137 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
138 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
139 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
140
141 // Grab the Kokkos::SparseCrsMatrices & inner stuff
142 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
143 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
144
145 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
146 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
147 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
148 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
149
150 c_lno_view_t Irowptr;
151 lno_nnz_view_t Icolind;
152 scalar_view_t Ivals;
153 if(!Bview.importMatrix.is_null()) {
154 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
155 Irowptr = lclB.graph.row_map;
156 Icolind = lclB.graph.entries;
157 Ivals = lclB.values;
158 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
159 }
160
161 // Sizes
162 RCP<const map_type> Ccolmap = C.getColMap();
163 size_t m = Aview.origMatrix->getLocalNumRows();
164 size_t n = Ccolmap->getLocalNumElements();
165 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
166
167 // Get my node / thread info (right from openmp or parameter list)
168 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
169 if(!params.is_null()) {
170 if(params->isParameter("openmp: ltg thread max"))
171 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
172 }
173
174 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
175 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
176 // we will not touch these until the final copyout step
177 lno_nnz_view_t entriesC;
178 scalar_view_t valuesC;
179
180 // add this, since we do the rowptr in place, we could figure this out
181 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
182 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
183
184 // Thread-local memory
185 Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
186 Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
187
188 double thread_chunk = (double)(m) / thread_max;
189
190 // Run chunks of the matrix independently
191 Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
192 {
193 // Thread coordination stuff
194 size_t my_thread_start = tid * thread_chunk;
195 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
196 size_t my_thread_m = my_thread_stop - my_thread_start;
197
198 // Size estimate
199 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
200
201 // Allocations
202 std::vector<size_t> c_status(n,INVALID);
203
204 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
205 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
206
207 // For each row of A/C
208 size_t CSR_ip = 0, OLD_ip = 0;
209 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
210 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
211 // on the calling process.
212 // JJE 10 Apr 2019 index directly into the rowptr
213 row_mapC(i) = CSR_ip;
214
215 // mfh 27 Sep 2016: For each entry of A in the current row of A
216 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
217 LO Aik = Acolind(k); // local column index of current entry of A
218 const SC Aval = Avals(k); // value of current entry of A
219 if (Aval == SC_ZERO)
220 continue; // skip explicitly stored zero values in A
221
222 if (targetMapToOrigRow(Aik) != LO_INVALID) {
223 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
224 // corresponding to the current entry of A is populated, then
225 // the corresponding row of B is in B_local (i.e., it lives on
226 // the calling process).
227
228 // Local matrix
229 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
230
231 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
232 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
233 LO Bkj = Bcolind(j);
234 LO Cij = Bcol2Ccol(Bkj);
235
236 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
237 // New entry
238 c_status[Cij] = CSR_ip;
239 Ccolind(CSR_ip) = Cij;
240 Cvals(CSR_ip) = Aval*Bvals(j);
241 CSR_ip++;
242
243 } else {
244 Cvals(c_status[Cij]) += Aval*Bvals(j);
245 }
246 }
247
248 } else {
249 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
250 // corresponding to the current entry of A NOT populated (has
251 // a flag "invalid" value), then the corresponding row of B is
252 // in B_local (i.e., it lives on the calling process).
253
254 // Remote matrix
255 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
256 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
257 LO Ikj = Icolind(j);
258 LO Cij = Icol2Ccol(Ikj);
259
260 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
261 // New entry
262 c_status[Cij] = CSR_ip;
263 Ccolind(CSR_ip) = Cij;
264 Cvals(CSR_ip) = Aval*Ivals(j);
265 CSR_ip++;
266
267 } else {
268 Cvals(c_status[Cij]) += Aval*Ivals(j);
269 }
270 }
271 }
272 }
273
274 // Resize for next pass if needed
275 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
276 CSR_alloc *= 2;
277 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
278 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
279 }
280 OLD_ip = CSR_ip;
281 }
282 thread_total_nnz(tid) = CSR_ip;
283 tl_colind(tid) = Ccolind;
284 tl_values(tid) = Cvals;
285 });
286
287 // Do the copy out
288 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
289
290#ifdef HAVE_TPETRA_MMM_TIMINGS
291 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
292#endif
293 // Sort & set values
294 if (params.is_null() || params->get("sort entries",true))
295 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
296 C.setAllValues(row_mapC,entriesC,valuesC);
297
298}
299
300/*********************************************************************************************************/
301template<class Scalar,
302 class LocalOrdinal,
303 class GlobalOrdinal,
304 class LocalOrdinalViewType>
305void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
306 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
307 const LocalOrdinalViewType & targetMapToOrigRow,
308 const LocalOrdinalViewType & targetMapToImportRow,
309 const LocalOrdinalViewType & Bcol2Ccol,
310 const LocalOrdinalViewType & Icol2Ccol,
311 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
312 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
313 const std::string& label,
314 const Teuchos::RCP<Teuchos::ParameterList>& params) {
315#ifdef HAVE_TPETRA_MMM_TIMINGS
316 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
317 using Teuchos::TimeMonitor;
318 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
319#endif
320
321 using Teuchos::Array;
322 using Teuchos::ArrayRCP;
323 using Teuchos::ArrayView;
324 using Teuchos::RCP;
325 using Teuchos::rcp;
326
327 // Lots and lots of typedefs
329 // typedef typename KCRS::device_type device_t;
330 typedef typename KCRS::StaticCrsGraphType graph_t;
331 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
332 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
333 typedef typename KCRS::values_type::non_const_type scalar_view_t;
334
335 typedef Scalar SC;
336 typedef LocalOrdinal LO;
337 typedef GlobalOrdinal GO;
338 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
339 typedef Map<LO,GO,NO> map_type;
340
341 // NOTE (mfh 15 Sep 2017) This is specifically only for
342 // execution_space = Kokkos::OpenMP, so we neither need nor want
343 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
344 typedef NO::execution_space execution_space;
345 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
346
347 // All of the invalid guys
348 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
349 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
350 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
351
352 // Grab the Kokkos::SparseCrsMatrices & inner stuff
353 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
354 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
355 const KCRS & Cmat = C.getLocalMatrixDevice();
356
357 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
358 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
359 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
360 scalar_view_t Cvals = Cmat.values;
361
362 c_lno_view_t Irowptr;
363 c_lno_nnz_view_t Icolind;
364 scalar_view_t Ivals;
365 if(!Bview.importMatrix.is_null()) {
366 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
367 Irowptr = lclB.graph.row_map;
368 Icolind = lclB.graph.entries;
369 Ivals = lclB.values;
370 }
371
372 // Sizes
373 RCP<const map_type> Ccolmap = C.getColMap();
374 size_t m = Aview.origMatrix->getLocalNumRows();
375 size_t n = Ccolmap->getLocalNumElements();
376
377 // Get my node / thread info (right from openmp or parameter list)
378 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
379 if(!params.is_null()) {
380 if(params->isParameter("openmp: ltg thread max"))
381 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
382 }
383
384 double thread_chunk = (double)(m) / thread_max;
385
386 // Run chunks of the matrix independently
387 Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
388 {
389 // Thread coordination stuff
390 size_t my_thread_start = tid * thread_chunk;
391 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
392
393 // Allocations
394 std::vector<size_t> c_status(n,INVALID);
395
396 // For each row of A/C
397 size_t CSR_ip = 0, OLD_ip = 0;
398 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
399 // First fill the c_status array w/ locations where we're allowed to
400 // generate nonzeros for this row
401 OLD_ip = Crowptr(i);
402 CSR_ip = Crowptr(i+1);
403 for (size_t k = OLD_ip; k < CSR_ip; k++) {
404 c_status[Ccolind(k)] = k;
405 // Reset values in the row of C
406 Cvals(k) = SC_ZERO;
407 }
408
409 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
410 LO Aik = Acolind(k);
411 const SC Aval = Avals(k);
412 if (Aval == SC_ZERO)
413 continue;
414
415 if (targetMapToOrigRow(Aik) != LO_INVALID) {
416 // Local matrix
417 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
418
419 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
420 LO Bkj = Bcolind(j);
421 LO Cij = Bcol2Ccol(Bkj);
422
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
425 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
426
427 Cvals(c_status[Cij]) += Aval * Bvals(j);
428 }
429 } else {
430 // Remote matrix
431 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
432 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
433 LO Ikj = Icolind(j);
434 LO Cij = Icol2Ccol(Ikj);
435
436 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
437 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
438 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
439
440 Cvals(c_status[Cij]) += Aval * Ivals(j);
441 }
442 }
443 }
444 }
445 });
446
447 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
448}
449
450/*********************************************************************************************************/
451template<class Scalar,
452 class LocalOrdinal,
453 class GlobalOrdinal,
454 class LocalOrdinalViewType>
455void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
456 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
457 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
458 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
459 const LocalOrdinalViewType & targetMapToOrigRow,
460 const LocalOrdinalViewType & targetMapToImportRow,
461 const LocalOrdinalViewType & Bcol2Ccol,
462 const LocalOrdinalViewType & Icol2Ccol,
463 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
464 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
465 const std::string& label,
466 const Teuchos::RCP<Teuchos::ParameterList>& params) {
467#ifdef HAVE_TPETRA_MMM_TIMINGS
468 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
469 using Teuchos::TimeMonitor;
470 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
471#endif
472
473 using Teuchos::Array;
474 using Teuchos::ArrayRCP;
475 using Teuchos::ArrayView;
476 using Teuchos::RCP;
477 using Teuchos::rcp;
478
479 // Lots and lots of typedefs
480 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
482 // typedef typename KCRS::device_type device_t;
483 typedef typename KCRS::StaticCrsGraphType graph_t;
484 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
485 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
486 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
487 typedef typename KCRS::values_type::non_const_type scalar_view_t;
488
489 // Unmanaged versions of the above
490 //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
491 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
492 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
493
494 // Jacobi-specific
495 typedef typename scalar_view_t::memory_space scalar_memory_space;
496
497 typedef Scalar SC;
498 typedef LocalOrdinal LO;
499 typedef GlobalOrdinal GO;
500 typedef Node NO;
501 typedef Map<LO,GO,NO> map_type;
502
503 // NOTE (mfh 15 Sep 2017) This is specifically only for
504 // execution_space = Kokkos::OpenMP, so we neither need nor want
505 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
506 typedef NO::execution_space execution_space;
507 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
508
509 // All of the invalid guys
510 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
511 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
512 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
513
514 // Grab the Kokkos::SparseCrsMatrices & inner stuff
515 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
516 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
517
518 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
519 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
520 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
521 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
522
523 c_lno_view_t Irowptr;
524 lno_nnz_view_t Icolind;
525 scalar_view_t Ivals;
526 if(!Bview.importMatrix.is_null()) {
527 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
528 Irowptr = lclB.graph.row_map;
529 Icolind = lclB.graph.entries;
530 Ivals = lclB.values;
531 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
532 }
533
534 // Jacobi-specific inner stuff
535 auto Dvals =
536 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
537
538 // Sizes
539 RCP<const map_type> Ccolmap = C.getColMap();
540 size_t m = Aview.origMatrix->getLocalNumRows();
541 size_t n = Ccolmap->getLocalNumElements();
542 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
543
544 // Get my node / thread info (right from openmp)
545 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
546 if(!params.is_null()) {
547 if(params->isParameter("openmp: ltg thread max"))
548 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
549 }
550
551 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
552 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
553 // we will not touch these until the final copyout step
554 lno_nnz_view_t entriesC;
555 scalar_view_t valuesC;
556
557 // add this, since we do the rowptr in place, we could figure this out
558 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
559 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
560
561 // Thread-local memory
562 Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
563 Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
564
565 double thread_chunk = (double)(m) / thread_max;
566
567 // Run chunks of the matrix independently
568 Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
569 {
570 // Thread coordination stuff
571 size_t my_thread_start = tid * thread_chunk;
572 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
573 size_t my_thread_m = my_thread_stop - my_thread_start;
574
575 // Size estimate
576 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
577
578 // Allocations
579 std::vector<size_t> c_status(n,INVALID);
580
581 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
582 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
583
584 // For each row of A/C
585 size_t CSR_ip = 0, OLD_ip = 0;
586 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
587 // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
588 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
589 // on the calling process.
590 // JJE: directly access the rowptr here (indexed using our loop var)
591 row_mapC(i) = CSR_ip;
592 // NOTE: Vector::getLocalView returns a rank 2 view here
593 SC minusOmegaDval = -omega*Dvals(i,0);
594
595 // Entries of B
596 for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
597 const SC Bval = Bvals(j);
598 if (Bval == SC_ZERO)
599 continue;
600 LO Bij = Bcolind(j);
601 LO Cij = Bcol2Ccol(Bij);
602
603 // Assume no repeated entries in B
604 c_status[Cij] = CSR_ip;
605 Ccolind(CSR_ip) = Cij;
606 Cvals(CSR_ip) = Bvals[j];
607 CSR_ip++;
608 }
609
610 // Entries of -omega * Dinv * A * B
611 // mfh 27 Sep 2016: For each entry of A in the current row of A
612 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
613 LO Aik = Acolind(k); // local column index of current entry of A
614 const SC Aval = Avals(k); // value of current entry of A
615 if (Aval == SC_ZERO)
616 continue; // skip explicitly stored zero values in A
617
618 if (targetMapToOrigRow(Aik) != LO_INVALID) {
619 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
620 // corresponding to the current entry of A is populated, then
621 // the corresponding row of B is in B_local (i.e., it lives on
622 // the calling process).
623
624 // Local matrix
625 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
626
627 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
628 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
629 LO Bkj = Bcolind(j);
630 LO Cij = Bcol2Ccol(Bkj);
631
632 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
633 // New entry
634 c_status[Cij] = CSR_ip;
635 Ccolind(CSR_ip) = Cij;
636 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
637 CSR_ip++;
638
639 } else {
640 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
641 }
642 }
643
644 } else {
645 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
646 // corresponding to the current entry of A NOT populated (has
647 // a flag "invalid" value), then the corresponding row of B is
648 // in B_local (i.e., it lives on the calling process).
649
650 // Remote matrix
651 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
652 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
653 LO Ikj = Icolind(j);
654 LO Cij = Icol2Ccol(Ikj);
655
656 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
657 // New entry
658 c_status[Cij] = CSR_ip;
659 Ccolind(CSR_ip) = Cij;
660 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
661 CSR_ip++;
662
663 } else {
664 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
665 }
666 }
667 }
668 }
669
670 // Resize for next pass if needed
671 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
672 CSR_alloc *= 2;
673 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
674 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
675 }
676 OLD_ip = CSR_ip;
677 }
678 thread_total_nnz(tid) = CSR_ip;
679 tl_colind(tid) = Ccolind;
680 tl_values(tid) = Cvals;
681 });
682
683
684 // Do the copy out (removed the tl_rowptr!)
685 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
686
687
688#ifdef HAVE_TPETRA_MMM_TIMINGS
689 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
690#endif
691 // Sort & set values
692 if (params.is_null() || params->get("sort entries",true))
693 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
694 C.setAllValues(row_mapC,entriesC,valuesC);
695
696}
697
698
699
700/*********************************************************************************************************/
701template<class Scalar,
702 class LocalOrdinal,
703 class GlobalOrdinal,
704 class LocalOrdinalViewType>
705void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
706 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
707 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
708 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
709 const LocalOrdinalViewType & targetMapToOrigRow,
710 const LocalOrdinalViewType & targetMapToImportRow,
711 const LocalOrdinalViewType & Bcol2Ccol,
712 const LocalOrdinalViewType & Icol2Ccol,
713 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
714 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
715 const std::string& label,
716 const Teuchos::RCP<Teuchos::ParameterList>& params) {
717#ifdef HAVE_TPETRA_MMM_TIMINGS
718 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
719 using Teuchos::TimeMonitor;
720 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
721#endif
722 using Teuchos::Array;
723 using Teuchos::ArrayRCP;
724 using Teuchos::ArrayView;
725 using Teuchos::RCP;
726 using Teuchos::rcp;
727
728 // Lots and lots of typedefs
729 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
731 // typedef typename KCRS::device_type device_t;
732 typedef typename KCRS::StaticCrsGraphType graph_t;
733 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
734 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
735 typedef typename KCRS::values_type::non_const_type scalar_view_t;
736
737 // Jacobi-specific
738 typedef typename scalar_view_t::memory_space scalar_memory_space;
739
740 typedef Scalar SC;
741 typedef LocalOrdinal LO;
742 typedef GlobalOrdinal GO;
743 typedef Node NO;
744 typedef Map<LO,GO,NO> map_type;
745
746 // NOTE (mfh 15 Sep 2017) This is specifically only for
747 // execution_space = Kokkos::OpenMP, so we neither need nor want
748 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
749 typedef NO::execution_space execution_space;
750 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
751
752 // All of the invalid guys
753 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
754 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
755 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
756
757 // Grab the Kokkos::SparseCrsMatrices & inner stuff
758 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
759 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
760 const KCRS & Cmat = C.getLocalMatrixDevice();
761
762 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
763 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
764 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
765 scalar_view_t Cvals = Cmat.values;
766
767 c_lno_view_t Irowptr;
768 c_lno_nnz_view_t Icolind;
769 scalar_view_t Ivals;
770 if(!Bview.importMatrix.is_null()) {
771 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
772 Irowptr = lclB.graph.row_map;
773 Icolind = lclB.graph.entries;
774 Ivals = lclB.values;
775 }
776
777 // Jacobi-specific inner stuff
778 auto Dvals =
779 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
780
781 // Sizes
782 RCP<const map_type> Ccolmap = C.getColMap();
783 size_t m = Aview.origMatrix->getLocalNumRows();
784 size_t n = Ccolmap->getLocalNumElements();
785
786 // Get my node / thread info (right from openmp or parameter list)
787 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
788 if(!params.is_null()) {
789 if(params->isParameter("openmp: ltg thread max"))
790 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
791 }
792
793 double thread_chunk = (double)(m) / thread_max;
794
795 // Run chunks of the matrix independently
796 Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
797 {
798 // Thread coordination stuff
799 size_t my_thread_start = tid * thread_chunk;
800 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
801
802 // Allocations
803 std::vector<size_t> c_status(n,INVALID);
804
805 // For each row of A/C
806 size_t CSR_ip = 0, OLD_ip = 0;
807 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
808 // First fill the c_status array w/ locations where we're allowed to
809 // generate nonzeros for this row
810 OLD_ip = Crowptr(i);
811 CSR_ip = Crowptr(i+1);
812 // NOTE: Vector::getLocalView returns a rank 2 view here
813 SC minusOmegaDval = -omega*Dvals(i,0);
814
815 for (size_t k = OLD_ip; k < CSR_ip; k++) {
816 c_status[Ccolind(k)] = k;
817 // Reset values in the row of C
818 Cvals(k) = SC_ZERO;
819 }
820
821 // Entries of B
822 for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
823 const SC Bval = Bvals(j);
824 if (Bval == SC_ZERO)
825 continue;
826 LO Bij = Bcolind(j);
827 LO Cij = Bcol2Ccol(Bij);
828
829 // Assume no repeated entries in B
830 Cvals(c_status[Cij]) += Bvals(j);
831 CSR_ip++;
832 }
833
834
835 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
836 LO Aik = Acolind(k);
837 const SC Aval = Avals(k);
838 if (Aval == SC_ZERO)
839 continue;
840
841 if (targetMapToOrigRow(Aik) != LO_INVALID) {
842 // Local matrix
843 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
844
845 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
846 LO Bkj = Bcolind(j);
847 LO Cij = Bcol2Ccol(Bkj);
848
849 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
850 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
851 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
852
853 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
854 }
855 } else {
856 // Remote matrix
857 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
858 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
859 LO Ikj = Icolind(j);
860 LO Cij = Icol2Ccol(Ikj);
861
862 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
863 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
864 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
865
866 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
867 }
868 }
869 }
870 }
871 });
872
873 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
874}
875
876
877/*********************************************************************************************************/
878template<class InColindArrayType, class InValsArrayType,
879 class OutRowptrType, class OutColindType, class OutValsType>
880void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
881 const InColindArrayType& Incolind,
882 const InValsArrayType& Invalues,
883 const size_t m,
884 const double thread_chunk,
885 OutRowptrType& row_mapC,
886 OutColindType& entriesC,
887 OutValsType& valuesC ) {
888 typedef OutRowptrType lno_view_t;
889 typedef OutColindType lno_nnz_view_t;
890 typedef OutValsType scalar_view_t;
891 typedef typename lno_view_t::execution_space execution_space;
892 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
893
894 // Generate the starting nnz number per thread
895 size_t thread_max = Incolind.size();
896 size_t c_nnz_size=0;
897 // since this kernel is 'low thread count' is very likely, that we could
898 // sum over the thread_total_nnz and not go parallel, but since it is a view
899 // we don't know where the memory actually lives... so we kinda need to go parallel
900
901 lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
902 Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
903 size_t mynnz = thread_total_nnz(i);
904 if(final) thread_start_nnz(i) = update;
905 update+=mynnz;
906 if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
907 });
908 c_nnz_size = thread_start_nnz(thread_max);
909
910 // 2019 Apr 10 JJE: update the rowptr's final entry here
911 row_mapC(m) = thread_start_nnz(thread_max);
912
913 // Allocate output
914 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
915 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
916
917 // Copy out
918 Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
919 const size_t my_thread_start = tid * thread_chunk;
920 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
921 const size_t nnz_thread_start = thread_start_nnz(tid);
922 // we need this info, since we did the rowptr in place
923 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
924
925 // There are two fundamental operations:
926 // * Updateing the rowptr with the correct offset
927 // * copying entries and values
928
929 // First, update the rowptr, it since we did it inplace, it is a += operation
930 // in the paper, I experimented with a coloring scheme that had threads do
931 // do their copies in different orders. It wasn't obvious it was beneficial
932 // but it can be replicated, by choosing the array to copy first based on your
933 // thread id % 3
934 if (my_thread_start != 0 ) {
935 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
936 row_mapC(i) += nnz_thread_start;
937 }
938 }
939
940 // try to Kokkos::single() the alloc here. It should implicitly barrier
941 // thread 0 doesn't copy the rowptr, so it could hit the single first
942 // in the paper, I played a game that effectively got LTG down to a single
943 // OpenMP barrier. But doing that requires the ability to spawn a single
944 // parallel region. The Scan above was implemented using atomic adds
945 // and the barrier was only needed so you could allocate
946 //
947 // Since we can't spawn a single region, we could move the allocations
948 // here, and using 'single'. Most likely, thread 0 will hit it first allowing
949 // the other threads to update the rowptr while it allocates.
950
951
952 // next, bulk copy the vals/colind arrays
953 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
954 for (size_t i = 0; i < my_num_nnz; ++i) {
955 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
956 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
957 }
958
959 //Free the unamanged views, let each thread deallocate its memory
960 // May need to cast away const here..
961 if(Incolind(tid).data()) free(Incolind(tid).data());
962 if(Invalues(tid).data()) free(Invalues(tid).data());
963 });
964}//end copy_out
965
966#endif // OpenMP
967
968
969/*********************************************************************************************************/
970template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
971void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
972 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
973 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
974 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
975 const LocalOrdinalViewType & Acol2Brow,
976 const LocalOrdinalViewType & Acol2Irow,
977 const LocalOrdinalViewType & Bcol2Ccol,
978 const LocalOrdinalViewType & Icol2Ccol,
979 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
980 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
981 const std::string& label,
982 const Teuchos::RCP<Teuchos::ParameterList>& params) {
983#ifdef HAVE_TPETRA_MMM_TIMINGS
984 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
985 using Teuchos::TimeMonitor;
986 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
987 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
988 using Teuchos::rcp;
989#endif
990 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
991
992 // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
993
994 // 1) Multiply A*B
995 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
996 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
997
998#ifdef HAVE_TPETRA_MMM_TIMINGS
999 MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
1000#endif
1001
1002 // 2) Scale A by Dinv
1003 AB->leftScale(Dinv);
1004
1005#ifdef HAVE_TPETRA_MMM_TIMINGS
1006 MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
1007#endif
1008
1009 // 3) Add [-omega Dinv A] + B
1010 Teuchos::ParameterList jparams;
1011 if(!params.is_null()) {
1012 jparams = *params;
1013 jparams.set("label",label+std::string(" MSAK Add"));
1014 }
1015 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1016 Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
1017#ifdef HAVE_TPETRA_MMM_TIMINGS
1018 MM2=Teuchos::null;
1019#endif
1020 }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
1021
1022
1023
1024#if defined (HAVE_TPETRA_INST_OPENMP)
1025/*********************************************************************************************************/
1026template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1027static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1028 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1029 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1030 const LocalOrdinalViewType & Acol2PRow,
1031 const LocalOrdinalViewType & Acol2PRowImport,
1032 const LocalOrdinalViewType & Pcol2Accol,
1033 const LocalOrdinalViewType & PIcol2Accol,
1034 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1035 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1036 const std::string& label,
1037 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1038
1039 using Teuchos::RCP;
1040 using Tpetra::MatrixMatrix::UnmanagedView;
1041 #ifdef HAVE_TPETRA_MMM_TIMINGS
1042 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1043 using Teuchos::rcp;
1044 using Teuchos::TimeMonitor;
1045 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1046 #endif
1047
1048 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1049 typedef Scalar SC;
1050 typedef LocalOrdinal LO;
1051 typedef GlobalOrdinal GO;
1052 typedef Node NO;
1053 typedef Map<LO,GO,NO> map_type;
1055 typedef typename KCRS::StaticCrsGraphType graph_t;
1056 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1057 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1058 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1059 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1060 typedef typename KCRS::device_type device_t;
1061 typedef typename device_t::execution_space execution_space;
1062 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1063
1064 // Unmanaged versions of the above
1065 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1066 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1067 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1068
1069 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1070 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1071
1072 // Sizes
1073 RCP<const map_type> Accolmap = Ac.getColMap();
1074 size_t m = Rview.origMatrix->getLocalNumRows();
1075 size_t n = Accolmap->getLocalNumElements();
1076
1077 // Get raw Kokkos matrices, and the raw CSR views
1078 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1079 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1080 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1081
1082 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1083 Arowptr = Amat.graph.row_map,
1084 Prowptr = Pmat.graph.row_map, Irowptr;
1085 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1086 Acolind = Amat.graph.entries,
1087 Pcolind = Pmat.graph.entries;
1088 lno_nnz_view_t Icolind;
1089 const scalar_view_t Rvals = Rmat.values,
1090 Avals = Amat.values,
1091 Pvals = Pmat.values;
1092 scalar_view_t Ivals;
1093
1094 if (!Pview.importMatrix.is_null())
1095 {
1096 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1097 Irowptr = Imat.graph.row_map;
1098 Icolind = Imat.graph.entries;
1099 Ivals = Imat.values;
1100 }
1101
1102 // Classic csr assembly (low memory edition)
1103 //
1104 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1105 // The method loops over rows of R, and may resize after processing
1106 // each row. Chris Siefert says that this reflects experience in
1107 // ML; for the non-threaded case, ML found it faster to spend less
1108 // effort on estimation and risk an occasional reallocation.
1109
1110 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1111 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1112
1113 // Get my node / thread info (right from openmp or parameter list)
1114 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1115 if(!params.is_null()) {
1116 if(params->isParameter("openmp: ltg thread max"))
1117 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1118 }
1119
1120 double thread_chunk = (double)(m) / thread_max;
1121
1122 // we can construct the rowptr inplace, allocate here and fault in parallel
1123 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1124 // we do not touch these until copy out
1125 lno_nnz_view_t entriesAc;
1126 scalar_view_t valuesAc;
1127
1128 // add this, since we do the rowptr in place, we could figure this out
1129 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1130 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1131
1132 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1133 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1134 //
1135 // For column index Aik in row i of A, Acol2PRow[Aik] tells
1136 // you whether the corresponding row of P belongs to P_local
1137 // ("orig") or P_remote ("Import").
1138
1139 // Thread-local memory
1140 Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind", thread_max);
1141 Kokkos::View<u_scalar_view_t*> tl_values("top_values", thread_max);
1142
1143 // For each row of R
1144 Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1145 {
1146 // Thread coordination stuff
1147 size_t my_thread_start = tid * thread_chunk;
1148 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1149 size_t my_thread_m = my_thread_stop - my_thread_start;
1150
1151 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1152
1153 std::vector<size_t> ac_status(n, INVALID);
1154
1155 //manually allocate the thread-local storage for Ac
1156 u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1157 u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1158 u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1159
1160 //count entries as they are added to Ac
1161 size_t nnz = 0, nnz_old = 0;
1162 // bmk: loop over the rows of R which are assigned to thread tid
1163 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1164 // directly index into the rowptr
1165 rowmapAc(i) = nnz;
1166 // mfh 27 Sep 2016: For each entry of R in the current row of R
1167 for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1168 LO k = Rcolind(kk); // local column index of current entry of R
1169 const SC Rik = Rvals(kk); // value of current entry of R
1170 if (Rik == SC_ZERO)
1171 continue; // skip explicitly stored zero values in R
1172 // For each entry of A in the current row of A
1173 for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1174 LO l = Acolind(ll); // local column index of current entry of A
1175 const SC Akl = Avals(ll); // value of current entry of A
1176 if (Akl == SC_ZERO)
1177 continue; // skip explicitly stored zero values in A
1178
1179 if (Acol2PRow[l] != LO_INVALID) {
1180 // mfh 27 Sep 2016: If the entry of Acol2PRow
1181 // corresponding to the current entry of A is populated, then
1182 // the corresponding row of P is in P_local (i.e., it lives on
1183 // the calling process).
1184
1185 // Local matrix
1186 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1187
1188 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1189 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1190 LO j = Pcolind(jj);
1191 LO Acj = Pcol2Accol(j);
1192 SC Plj = Pvals(jj);
1193
1194 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1195 #ifdef HAVE_TPETRA_DEBUG
1196 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1197 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1198 std::runtime_error,
1199 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1200 #endif
1201 // New entry
1202 ac_status[Acj] = nnz;
1203 Accolind(nnz) = Acj;
1204 Acvals(nnz) = Rik*Akl*Plj;
1205 nnz++;
1206 } else {
1207 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1208 }
1209 }
1210 } else {
1211 // mfh 27 Sep 2016: If the entry of Acol2PRow
1212 // corresponding to the current entry of A is NOT populated (has
1213 // a flag "invalid" value), then the corresponding row of P is
1214 // in P_remote (i.e., it does not live on the calling process).
1215
1216 // Remote matrix
1217 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1218 for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1219 LO j = Icolind(jj);
1220 LO Acj = PIcol2Accol(j);
1221 SC Plj = Ivals(jj);
1222
1223 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1224 #ifdef HAVE_TPETRA_DEBUG
1225 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1226 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1227 std::runtime_error,
1228 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1229 #endif
1230 // New entry
1231 ac_status[Acj] = nnz;
1232 Accolind(nnz) = Acj;
1233 Acvals(nnz) = Rik*Akl*Plj;
1234 nnz++;
1235 } else {
1236 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1237 }
1238 }
1239 }
1240 }
1241 }
1242 // Resize for next pass if needed
1243 if (nnz + n > nnzAllocated) {
1244 nnzAllocated *= 2;
1245 Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1246 Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1247 }
1248 nnz_old = nnz;
1249 }
1250 thread_total_nnz(tid) = nnz;
1251 tl_colind(tid) = Accolind;
1252 tl_values(tid) = Acvals;
1253 });
1254 #ifdef HAVE_TPETRA_MMM_TIMINGS
1255 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1256 #endif
1257
1258 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1259
1260 #ifdef HAVE_TPETRA_MMM_TIMINGS
1261 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1262 #endif
1263
1264 // Final sort & set of CRS arrays
1265 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1266 // mfh 27 Sep 2016: This just sets pointers.
1267 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1268
1269 #ifdef HAVE_TPETRA_MMM_TIMINGS
1270 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1271 #endif
1272
1273 // Final FillComplete
1274 //
1275 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1276 // Import (from domain Map to column Map) construction (which costs
1277 // lots of communication) by taking the previously constructed
1278 // Import object. We should be able to do this without interfering
1279 // with the implementation of the local part of sparse matrix-matrix
1280 // multiply above.
1281 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1282 labelList->set("Timer Label",label);
1283 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1284 RCP<const Export<LO,GO,NO> > dummyExport;
1285 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1286 Rview.origMatrix->getRangeMap(),
1287 Acimport,
1288 dummyExport,
1289 labelList);
1290}
1291
1292
1293
1294/*********************************************************************************************************/
1295template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1296static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1297 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1298 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1299 const LocalOrdinalViewType & Acol2PRow,
1300 const LocalOrdinalViewType & Acol2PRowImport,
1301 const LocalOrdinalViewType & Pcol2Accol,
1302 const LocalOrdinalViewType & PIcol2Accol,
1303 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1304 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1305 const std::string& label,
1306 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1307
1308 using Teuchos::RCP;
1309 using Tpetra::MatrixMatrix::UnmanagedView;
1310 #ifdef HAVE_TPETRA_MMM_TIMINGS
1311 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1312 using Teuchos::TimeMonitor;
1313 using Teuchos::rcp;
1314 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1315 #endif
1316
1317 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1318 typedef Scalar SC;
1319 typedef LocalOrdinal LO;
1320 typedef GlobalOrdinal GO;
1321 typedef Node NO;
1322 typedef Map<LO,GO,NO> map_type;
1324 typedef typename KCRS::StaticCrsGraphType graph_t;
1325 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1326 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1327 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1328 typedef typename KCRS::device_type device_t;
1329 typedef typename device_t::execution_space execution_space;
1330 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1331
1332 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1333 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1334 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1335
1336 // Sizes
1337 RCP<const map_type> Accolmap = Ac.getColMap();
1338 size_t m = Rview.origMatrix->getLocalNumRows();
1339 size_t n = Accolmap->getLocalNumElements();
1340
1341 // Get raw Kokkos matrices, and the raw CSR views
1342 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1343 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1344 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1345 const KCRS & Cmat = Ac.getLocalMatrixDevice();
1346
1347 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1348 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1349 lno_nnz_view_t Icolind;
1350 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1351 scalar_view_t Cvals = Cmat.values;
1352 scalar_view_t Ivals;
1353
1354 if (!Pview.importMatrix.is_null())
1355 {
1356 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1357 Irowptr = Imat.graph.row_map;
1358 Icolind = Imat.graph.entries;
1359 Ivals = Imat.values;
1360 }
1361
1362 // Get my node / thread info (right from openmp or parameter list)
1363 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1364 if(!params.is_null()) {
1365 if(params->isParameter("openmp: ltg thread max"))
1366 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1367 }
1368
1369 double thread_chunk = (double)(m) / thread_max;
1370
1371 // For each row of R
1372 Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1373 {
1374 // Thread coordination stuff
1375 size_t my_thread_start = tid * thread_chunk;
1376 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1377
1378 std::vector<size_t> c_status(n, INVALID);
1379
1380 //count entries as they are added to Ac
1381 size_t OLD_ip = 0, CSR_ip = 0;
1382 // bmk: loop over the rows of R which are assigned to thread tid
1383 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1384 // First fill the c_status array w/ locations where we're allowed to
1385 // generate nonzeros for this row
1386 OLD_ip = Crowptr(i);
1387 CSR_ip = Crowptr(i+1);
1388 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1389 c_status[Ccolind(k)] = k;
1390 // Reset values in the row of C
1391 Cvals(k) = SC_ZERO;
1392 }
1393
1394 // mfh 27 Sep 2016: For each entry of R in the current row of R
1395 for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1396 LO k = Rcolind(kk); // local column index of current entry of R
1397 const SC Rik = Rvals(kk); // value of current entry of R
1398 if (Rik == SC_ZERO)
1399 continue; // skip explicitly stored zero values in R
1400 // For each entry of A in the current row of A
1401 for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1402 LO l = Acolind(ll); // local column index of current entry of A
1403 const SC Akl = Avals(ll); // value of current entry of A
1404 if (Akl == SC_ZERO)
1405 continue; // skip explicitly stored zero values in A
1406
1407 if (Acol2PRow[l] != LO_INVALID) {
1408 // mfh 27 Sep 2016: If the entry of Acol2PRow
1409 // corresponding to the current entry of A is populated, then
1410 // the corresponding row of P is in P_local (i.e., it lives on
1411 // the calling process).
1412
1413 // Local matrix
1414 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1415
1416 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1417 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1418 LO j = Pcolind(jj);
1419 LO Cij = Pcol2Accol(j);
1420 SC Plj = Pvals(jj);
1421 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1422 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1423 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1424
1425
1426 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1427 }
1428 } else {
1429 // mfh 27 Sep 2016: If the entry of Acol2PRow
1430 // corresponding to the current entry of A is NOT populated (has
1431 // a flag "invalid" value), then the corresponding row of P is
1432 // in P_remote (i.e., it does not live on the calling process).
1433
1434 // Remote matrix
1435 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1436 for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1437 LO j = Icolind(jj);
1438 LO Cij = PIcol2Accol(j);
1439 SC Plj = Ivals(jj);
1440 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1441 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1442 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1443
1444 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1445 }
1446 }
1447 }
1448 }
1449 }
1450 });
1451 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1452}
1453
1454
1455
1456#endif
1457
1458
1459}//ExtraKernels
1460}//MatrixMatrix
1461}//Tpetra
1462
1463
1464#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.