Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// ************************************************************************
38// @HEADER
39
40#ifndef TPETRA_CRSMATRIX_DEF_HPP
41#define TPETRA_CRSMATRIX_DEF_HPP
42
50
53#include "Tpetra_RowMatrix.hpp"
54#include "Tpetra_LocalCrsMatrixOperator.hpp"
55
62#include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
67#include "KokkosSparse_getDiagCopy.hpp"
71#include "Tpetra_Details_packCrsMatrix.hpp"
72#include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
74#include "Teuchos_FancyOStream.hpp"
75#include "Teuchos_RCP.hpp"
76#include "Teuchos_DataAccess.hpp"
77#include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
78#include "KokkosBlas.hpp"
79
80#include <memory>
81#include <sstream>
82#include <typeinfo>
83#include <utility>
84#include <vector>
85
86namespace Tpetra {
87
88namespace { // (anonymous)
89
90 template<class T, class BinaryFunction>
91 T atomic_binary_function_update (volatile T* const dest,
92 const T& inputVal,
93 BinaryFunction f)
94 {
95 T oldVal = *dest;
96 T assume;
97
98 // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
99 // POWER architectures, because 'newval' depends on 'assume',
100 // which depends on 'oldVal', which depends on '*dest'. This
101 // sets up a chain of read dependencies that should ensure
102 // correct behavior given a sane memory model.
103 do {
104 assume = oldVal;
105 T newVal = f (assume, inputVal);
106 oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
107 } while (assume != oldVal);
108
109 return oldVal;
110 }
111} // namespace (anonymous)
112
113//
114// Users must never rely on anything in the Details namespace.
115//
116namespace Details {
117
127template<class Scalar>
128struct AbsMax {
130 Scalar operator() (const Scalar& x, const Scalar& y) {
131 typedef Teuchos::ScalarTraits<Scalar> STS;
132 return std::max (STS::magnitude (x), STS::magnitude (y));
133 }
134};
135
136} // namespace Details
137} // namespace Tpetra
138
139namespace Tpetra {
140
// Constructor taking a row Map and a single per-row entry count.
//
// Builds a new crs_graph_type from (rowMap, maxNumEntriesPerRow, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param maxNumEntriesPerRow [in] Forwarded unchanged to the CrsGraph
//   constructor.
// \param params [in] Forwarded to the CrsGraph constructor and to
//   resumeFill.
//
// Any std::exception escaping the CrsGraph constructor is caught and
// re-reported as std::runtime_error with e.what() appended.
141 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
143 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
144 size_t maxNumEntriesPerRow,
145 const Teuchos::RCP<Teuchos::ParameterList>& params) :
146 dist_object_type (rowMap)
147 {
    // tfecfFuncName is not referenced by name below; presumably the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro picks it up to prefix
    // its message -- confirm against the Teuchos macro definition.
148 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
149 "[, RCP<ParameterList>]): ";
150 Teuchos::RCP<crs_graph_type> graph;
151 try {
152 graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
153 params));
154 }
155 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
156 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
157 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
158 "size_t [, RCP<ParameterList>]) threw an exception: "
159 << e.what ());
160 }
161 // myGraph_ not null means that the matrix owns the graph. That's
162 // different than the const CrsGraph constructor, where the matrix
163 // does _not_ own the graph.
164 myGraph_ = graph;
165 staticGraph_ = myGraph_;
166 resumeFill (params);
168 }
169
// Constructor taking a row Map and a per-row array of entry counts.
//
// Builds a new crs_graph_type from (rowMap, numEntPerRowToAlloc, params),
// stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param numEntPerRowToAlloc [in] Forwarded unchanged to the CrsGraph
//   constructor.
// \param params [in] Forwarded to the CrsGraph constructor and to
//   resumeFill.
//
// Any std::exception escaping the CrsGraph constructor is caught and
// re-reported as std::runtime_error with e.what() appended.
170 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
172 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
173 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
174 const Teuchos::RCP<Teuchos::ParameterList>& params) :
175 dist_object_type (rowMap)
176 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
177 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
178 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
179 Teuchos::RCP<crs_graph_type> graph;
180 try {
181 using Teuchos::rcp;
182 graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
183 params));
184 }
185 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
186 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
187 (true, std::runtime_error, "CrsGraph constructor "
188 "(RCP<const Map>, ArrayView<const size_t>"
189 "[, RCP<ParameterList>]) threw an exception: "
190 << e.what ());
191 }
192 // myGraph_ not null means that the matrix owns the graph. That's
193 // different than the const CrsGraph constructor, where the matrix
194 // does _not_ own the graph.
195 myGraph_ = graph;
196 staticGraph_ = graph;
197 resumeFill (params);
199 }
200
// Constructor taking row and column Maps and a single per-row entry count.
//
// After checking that staticGraph_ and myGraph_ are both still null,
// builds a new crs_graph_type from (rowMap, colMap, maxNumEntPerRow,
// params), stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param maxNumEntPerRow [in] Forwarded unchanged to the CrsGraph
//   constructor.
// \param params [in] Forwarded to the CrsGraph constructor and to
//   resumeFill.
//
// Reports std::logic_error if either graph pointer is non-null on entry,
// and std::runtime_error (with e.what() appended) if the CrsGraph
// constructor throws a std::exception.
201 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
203 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
204 const Teuchos::RCP<const map_type>& colMap,
205 const size_t maxNumEntPerRow,
206 const Teuchos::RCP<Teuchos::ParameterList>& params) :
207 dist_object_type (rowMap)
208 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
209 const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
210 "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
    // Appended to the two internal-consistency messages below.
211 const char suffix[] =
212 " Please report this bug to the Tpetra developers.";
213
214 // An artifact of debugging something a while back.
215 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
216 (! staticGraph_.is_null (), std::logic_error,
217 "staticGraph_ is not null at the beginning of the constructor."
218 << suffix);
219 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
220 (! myGraph_.is_null (), std::logic_error,
221 "myGraph_ is not null at the beginning of the constructor."
222 << suffix);
223 Teuchos::RCP<crs_graph_type> graph;
224 try {
225 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
226 maxNumEntPerRow,
227 params));
228 }
229 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
230 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
231 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
232 "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
233 "exception: " << e.what ());
234 }
235 // myGraph_ not null means that the matrix owns the graph. That's
236 // different than the const CrsGraph constructor, where the matrix
237 // does _not_ own the graph.
238 myGraph_ = graph;
239 staticGraph_ = myGraph_;
240 resumeFill (params);
242 }
243
// Constructor taking row and column Maps and a per-row array of entry
// counts.
//
// Builds a new crs_graph_type from (rowMap, colMap, numEntPerRowToAlloc,
// params), stores it in both myGraph_ and staticGraph_, and then calls
// resumeFill(params).
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param numEntPerRowToAlloc [in] Forwarded unchanged to the CrsGraph
//   constructor.
// \param params [in] Forwarded to the CrsGraph constructor and to
//   resumeFill.
//
// Any std::exception escaping the CrsGraph constructor is caught and
// re-reported as std::runtime_error with e.what() appended.
244 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
246 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
247 const Teuchos::RCP<const map_type>& colMap,
248 const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
249 const Teuchos::RCP<Teuchos::ParameterList>& params) :
250 dist_object_type (rowMap)
251 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
252 const char tfecfFuncName[] =
253 "CrsMatrix(RCP<const Map>, RCP<const Map>, "
254 "ArrayView<const size_t>[, RCP<ParameterList>]): ";
255 Teuchos::RCP<crs_graph_type> graph;
256 try {
257 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
258 numEntPerRowToAlloc,
259 params));
260 }
261 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
262 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
263 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
264 "RCP<const Map>, ArrayView<const size_t>[, "
265 "RCP<ParameterList>]) threw an exception: " << e.what ());
266 }
267 // myGraph_ not null means that the matrix owns the graph. That's
268 // different than the const CrsGraph constructor, where the matrix
269 // does _not_ own the graph.
270 myGraph_ = graph;
271 staticGraph_ = graph;
272 resumeFill (params);
274 }
275
276
// Constructor taking an existing, fill-complete graph.
//
// Stores the graph in staticGraph_ only (myGraph_ is not assigned in this
// body), sets storageStatus_ to Details::STORAGE_1D_PACKED, and allocates
// a values array with one slot per entry of graph->lclIndsPacked_wdv.
// Both valuesPacked_wdv and valuesUnpacked_wdv are set to that array.
//
// \param graph [in] Graph to use; must be non-null and fill complete.
// \param params Unnamed in the signature and not used in this body.
//
// Reports std::runtime_error if the graph is null or not fill complete.
// When Details::Behavior::verbose("CrsMatrix") is true, progress lines
// are written to std::cerr.
//
// NOTE(review): the initializer list evaluates graph->getRowMap() before
// the body's graph.is_null() check runs, so the null check below cannot
// be what protects that dereference -- confirm how a null RCP behaves
// there.
277 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
279 CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
280 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
281 dist_object_type (graph->getRowMap ()),
282 staticGraph_ (graph),
283 storageStatus_ (Details::STORAGE_1D_PACKED)
284 {
285 using std::endl;
286 typedef typename local_matrix_device_type::values_type values_type;
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
287 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
288 "RCP<ParameterList>]): ";
289 const bool verbose = Details::Behavior::verbose("CrsMatrix");
290
    // prefix is only populated (and only dereferenced) when verbose.
291 std::unique_ptr<std::string> prefix;
292 if (verbose) {
293 prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
      // Build the whole line first, then emit it with a single write.
294 std::ostringstream os;
295 os << *prefix << "Start" << endl;
296 std::cerr << os.str ();
297 }
298
299 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
300 (graph.is_null (), std::runtime_error, "Input graph is null.");
301 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
302 (! graph->isFillComplete (), std::runtime_error, "Input graph "
303 "is not fill complete. You must call fillComplete on the "
304 "graph before using it to construct a CrsMatrix. Note that "
305 "calling resumeFill on the graph makes it not fill complete, "
306 "even if you had previously called fillComplete. In that "
307 "case, you must call fillComplete on the graph again.");
308
309 // The graph is fill complete, so it is locally indexed and has a
310 // fixed structure. This means we can allocate the (1-D) array of
311 // values and build the local matrix right now. Note that the
312 // local matrix's number of columns comes from the column Map, not
313 // the domain Map.
314
    // One value slot per packed local column index in the graph.
315 const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
316 if (verbose) {
317 std::ostringstream os;
318 os << *prefix << "Allocate values: " << numEnt << endl;
319 std::cerr << os.str ();
320 }
321
322 values_type val ("Tpetra::CrsMatrix::values", numEnt);
323 valuesPacked_wdv = values_wdv_type(val);
    // Packed and unpacked storage refer to the same wrapper here.
324 valuesUnpacked_wdv = valuesPacked_wdv;
325
327
328 if (verbose) {
329 std::ostringstream os;
330 os << *prefix << "Done" << endl;
331 std::cerr << os.str ();
332 }
333 }
334
// Constructor that pairs values taken from an existing matrix with a
// (fill-complete) graph.
//
// NOTE(review): the lines that declare the constructor name and its first
// parameter are not visible in this listing; the body reads
// `matrix.storageStatus_`, `matrix.valuesPacked_wdv` and
// `matrix.valuesUnpacked_wdv`, so `matrix` is presumably a CrsMatrix
// passed as the first argument -- confirm against the class declaration.
//
// Stores the graph in staticGraph_ only (myGraph_ is not assigned in this
// body), copies storageStatus_ from `matrix`, and sets valuesPacked_wdv /
// valuesUnpacked_wdv to values_wdv_type objects constructed from the
// corresponding members of `matrix`, offset 0, and a length equal to the
// extent of the graph's packed / unpacked local index arrays.
//
// \param graph [in] Graph to use; must be non-null and fill complete.
// \param params Not used in the visible body.
//
// Reports std::runtime_error if the graph is null or not fill complete.
//
// NOTE(review): the initializer list evaluates graph->getRowMap() before
// the body's graph.is_null() check runs.
335 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
338 const Teuchos::RCP<const crs_graph_type>& graph,
339 const Teuchos::RCP<Teuchos::ParameterList>& params) :
340 dist_object_type (graph->getRowMap ()),
341 staticGraph_ (graph),
342 storageStatus_ (matrix.storageStatus_)
343 {
    // NOTE(review): this text names a values_type argument that the
    // visible signature does not have; it matches the string used by the
    // (graph, values, params) constructor -- looks copied; confirm.
344 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
345 "local_matrix_device_type::values_type, "
346 "[,RCP<ParameterList>]): ";
347 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
348 (graph.is_null (), std::runtime_error, "Input graph is null.");
349 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
350 (! graph->isFillComplete (), std::runtime_error, "Input graph "
351 "is not fill complete. You must call fillComplete on the "
352 "graph before using it to construct a CrsMatrix. Note that "
353 "calling resumeFill on the graph makes it not fill complete, "
354 "even if you had previously called fillComplete. In that "
355 "case, you must call fillComplete on the graph again.");
356
    // Length of the packed values comes from the graph, not from matrix.
357 size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
358 valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
359
    // Likewise for the unpacked values.
360 size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
361 valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
362
364 }
365
366
// Constructor taking an existing, fill-complete graph and a values array.
//
// Stores the graph in staticGraph_ only (myGraph_ is not assigned in this
// body), sets storageStatus_ to Details::STORAGE_1D_PACKED, and wraps
// `values` in valuesPacked_wdv; valuesUnpacked_wdv is set to the same
// wrapper.
//
// \param graph [in] Graph to use; must be non-null and fill complete.
// \param values [in] Matrix values, wrapped via values_wdv_type(values).
// \param params Unnamed in the signature and not used in this body.
//
// Reports std::runtime_error if the graph is null or not fill complete.
//
// NOTE(review): the initializer list evaluates graph->getRowMap() before
// the body's graph.is_null() check runs.
// NOTE(review): nothing in this body compares values.extent(0) with the
// number of entries in the graph -- confirm whether the caller guarantees
// it.
367 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
369 CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
370 const typename local_matrix_device_type::values_type& values,
371 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
372 dist_object_type (graph->getRowMap ()),
373 staticGraph_ (graph),
374 storageStatus_ (Details::STORAGE_1D_PACKED)
375 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
376 const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
377 "local_matrix_device_type::values_type, "
378 "[,RCP<ParameterList>]): ";
379 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
380 (graph.is_null (), std::runtime_error, "Input graph is null.");
381 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
382 (! graph->isFillComplete (), std::runtime_error, "Input graph "
383 "is not fill complete. You must call fillComplete on the "
384 "graph before using it to construct a CrsMatrix. Note that "
385 "calling resumeFill on the graph makes it not fill complete, "
386 "even if you had previously called fillComplete. In that "
387 "case, you must call fillComplete on the graph again.");
388
389 // The graph is fill complete, so it is locally indexed and has a
390 // fixed structure. This means we can allocate the (1-D) array of
391 // values and build the local matrix right now. Note that the
392 // local matrix's number of columns comes from the column Map, not
393 // the domain Map.
394
395 valuesPacked_wdv = values_wdv_type(values);
    // Packed and unpacked storage refer to the same wrapper here.
396 valuesUnpacked_wdv = valuesPacked_wdv;
397
398 // FIXME (22 Jun 2016) I would very much like to get rid of
399 // k_values1D_ at some point. I find it confusing to have all
400 // these extra references lying around.
401 // KDDKDD ALMOST THERE, MARK!
402// k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
403
405 }
406
// Constructor taking row and column Maps plus the three CRS arrays
// (row pointers, column indices, values) as Kokkos views.
//
// Validates the array extents, builds a new crs_graph_type from (rowMap,
// colMap, rowPointers, columnIndices, params), stores it in both myGraph_
// and staticGraph_, and wraps `values` in valuesPacked_wdv;
// valuesUnpacked_wdv is set to the same wrapper.  storageStatus_ is set to
// Details::STORAGE_1D_PACKED.
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param rowPointers [in] Row offsets, forwarded to the CrsGraph
//   constructor.
// \param columnIndices [in] Column indices, forwarded to the CrsGraph
//   constructor.
// \param values [in] Matrix values; extent must equal
//   columnIndices.extent(0).
// \param params [in] Forwarded to the CrsGraph constructor.
//
// Reports std::invalid_argument for mismatched input extents,
// std::runtime_error if the CrsGraph constructor throws a std::exception,
// and std::logic_error if the resulting local graph has unexpected
// extents.  When Details::Behavior::verbose("CrsMatrix") is true,
// progress lines are written to std::cerr.
407 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
409 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
410 const Teuchos::RCP<const map_type>& colMap,
411 const typename local_graph_device_type::row_map_type& rowPointers,
412 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
413 const typename local_matrix_device_type::values_type& values,
414 const Teuchos::RCP<Teuchos::ParameterList>& params) :
415 dist_object_type (rowMap),
416 storageStatus_ (Details::STORAGE_1D_PACKED)
417 {
418 using Details::getEntryOnHost;
419 using Teuchos::RCP;
420 using std::endl;
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
421 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
422 "RCP<const Map>, ptr, ind, val[, params]): ";
    // NOTE(review): suffix starts with '.', and the first logic_error
    // message below already ends in "correctly." -- the combined text
    // contains a doubled period.
423 const char suffix[] =
424 ". Please report this bug to the Tpetra developers.";
425 const bool debug = Details::Behavior::debug("CrsMatrix");
426 const bool verbose = Details::Behavior::verbose("CrsMatrix");
427
    // prefix is only populated (and only dereferenced) when verbose.
428 std::unique_ptr<std::string> prefix;
429 if (verbose) {
430 prefix = this->createPrefix(
431 "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
432 std::ostringstream os;
433 os << *prefix << "Start" << endl;
434 std::cerr << os.str ();
435 }
436
437 // Check the user's input. Note that this might throw only on
438 // some processes but not others, causing deadlock. We prefer
439 // deadlock due to exceptions to segfaults, because users can
440 // catch exceptions.
441 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
442 (values.extent(0) != columnIndices.extent(0),
443 std::invalid_argument, "values.extent(0)=" << values.extent(0)
444 << " != columnIndices.extent(0) = " << columnIndices.extent(0)
445 << ".");
    // Debug-only check: the last row pointer must equal the number of
    // entries.  Skipped when rowPointers is empty, since there is then no
    // last element to read.
446 if (debug && rowPointers.extent(0) != 0) {
447 const size_t numEnt =
448 getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
449 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
450 (numEnt != size_t(columnIndices.extent(0)) ||
451 numEnt != size_t(values.extent(0)),
452 std::invalid_argument, "Last entry of rowPointers says that "
453 "the matrix has " << numEnt << " entr"
454 << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
455 "columnIndices and values don't match this. "
456 "columnIndices.extent(0)=" << columnIndices.extent (0)
457 << " and values.extent(0)=" << values.extent (0) << ".");
458 }
459
460 RCP<crs_graph_type> graph;
461 try {
462 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
463 columnIndices, params));
464 }
465 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
466 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
467 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
468 "RCP<const Map>, ptr, ind[, params]) threw an exception: "
469 << e.what ());
470 }
471 // The newly created CrsGraph _must_ have a local graph at this
472 // point. We don't really care whether CrsGraph's constructor
473 // deep-copies or shallow-copies the input, but the dimensions
474 // have to be right. That's how we tell whether the CrsGraph has
475 // a local graph.
476 auto lclGraph = graph->getLocalGraphDevice ();
477 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
478 (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
479 lclGraph.entries.extent (0) != columnIndices.extent (0),
480 std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
481 "ind[, params]) did not set the local graph correctly." << suffix);
482 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
483 (lclGraph.entries.extent (0) != values.extent (0),
484 std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
485 "params]) did not set the local graph correctly. "
486 "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
487 << " != values.extent(0) = " << values.extent (0) << suffix);
488
489 // myGraph_ not null means that the matrix owns the graph. This
490 // is true because the column indices come in as nonconst,
491 // implying shared ownership.
492 myGraph_ = graph;
493 staticGraph_ = graph;
494
495 // The graph may not be fill complete yet. However, it is locally
496 // indexed (since we have a column Map) and has a fixed structure
497 // (due to the input arrays). This means we can allocate the
498 // (1-D) array of values and build the local matrix right now.
499 // Note that the local matrix's number of columns comes from the
500 // column Map, not the domain Map.
501
502 valuesPacked_wdv = values_wdv_type(values);
    // Packed and unpacked storage refer to the same wrapper here.
503 valuesUnpacked_wdv = valuesPacked_wdv;
504
505 // FIXME (22 Jun 2016) I would very much like to get rid of
506 // k_values1D_ at some point. I find it confusing to have all
507 // these extra references lying around.
508// this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
509
511 if (verbose) {
512 std::ostringstream os;
513 os << *prefix << "Done" << endl;
514 std::cerr << os.str();
515 }
516 }
517
// Constructor taking row and column Maps plus the three CRS arrays
// (row pointers, column indices, values) as Teuchos::ArrayRCP.
//
// Builds a new crs_graph_type from (rowMap, colMap, ptr, ind, params),
// stores it in both myGraph_ and staticGraph_, checks that the resulting
// local graph has the same extents as ptr and ind, and then converts
// `val` into a values_type view that is wrapped in valuesPacked_wdv;
// valuesUnpacked_wdv is set to the same wrapper.  storageStatus_ is set to
// Details::STORAGE_1D_PACKED.
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param ptr [in] Row offsets, forwarded to the CrsGraph constructor.
// \param ind [in] Column indices, forwarded to the CrsGraph constructor.
// \param val [in] Matrix values.
// \param params [in] Forwarded to the CrsGraph constructor.
//
// Reports std::runtime_error if the CrsGraph constructor throws a
// std::exception, and std::logic_error if the local graph's extents do
// not match ptr.size() / ind.size().
//
// NOTE(review): nothing in this body compares val.size() with ind.size()
// -- confirm whether that is checked elsewhere.
518 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
520 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
521 const Teuchos::RCP<const map_type>& colMap,
522 const Teuchos::ArrayRCP<size_t>& ptr,
523 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
524 const Teuchos::ArrayRCP<Scalar>& val,
525 const Teuchos::RCP<Teuchos::ParameterList>& params) :
526 dist_object_type (rowMap),
527 storageStatus_ (Details::STORAGE_1D_PACKED)
528 {
529 using Kokkos::Compat::getKokkosViewDeepCopy;
530 using Teuchos::av_reinterpret_cast;
531 using Teuchos::RCP;
532 using values_type = typename local_matrix_device_type::values_type;
533 using IST = impl_scalar_type;
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
534 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
535 "RCP<const Map>, ptr, ind, val[, params]): ";
536
537 RCP<crs_graph_type> graph;
538 try {
539 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
540 ind, params));
541 }
542 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
543 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
544 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
545 "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
546 "RCP<ParameterList>]) threw an exception: " << e.what ());
547 }
548 // myGraph_ not null means that the matrix owns the graph. This
549 // is true because the column indices come in as nonconst,
550 // implying shared ownership.
551 myGraph_ = graph;
552 staticGraph_ = graph;
553
554 // The graph may not be fill complete yet. However, it is locally
555 // indexed (since we have a column Map) and has a fixed structure
556 // (due to the input arrays). This means we can allocate the
557 // (1-D) array of values and build the local matrix right now.
558 // Note that the local matrix's number of columns comes from the
559 // column Map, not the domain Map.
560
561 // The graph _must_ have a local graph at this point. We don't
562 // really care whether CrsGraph's constructor deep-copies or
563 // shallow-copies the input, but the dimensions have to be right.
564 // That's how we tell whether the CrsGraph has a local graph.
565 auto lclGraph = staticGraph_->getLocalGraphDevice ();
    // Both sides are cast to size_t so the comparison is between like
    // types.
566 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
567 (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
568 size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
569 std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
570 "ptr, ind[, params]) did not set the local graph correctly. "
571 "Please report this bug to the Tpetra developers.");
572
    // View val as impl_scalar_type and turn it into a values_type;
    // presumably a deep copy, per the helper's name -- confirm.
573 values_type valIn =
574 getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
575 valuesPacked_wdv = values_wdv_type(valIn);
    // Packed and unpacked storage refer to the same wrapper here.
576 valuesUnpacked_wdv = valuesPacked_wdv;
577
578 // FIXME (22 Jun 2016) I would very much like to get rid of
579 // k_values1D_ at some point. I find it confusing to have all
580 // these extra references lying around.
581// this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
582
584 }
585
// Constructor taking row and column Maps and a local (Kokkos) matrix.
//
// Sets fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED in the initializer list, builds a new
// crs_graph_type from (rowMap, colMap, lclMatrix.graph, params), requires
// that graph to be fill complete, stores it in both myGraph_ and
// staticGraph_, and wraps lclMatrix.values in valuesPacked_wdv;
// valuesUnpacked_wdv is set to the same wrapper.
//
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param lclMatrix [in] Local matrix supplying the graph and the values.
// \param params [in] Forwarded to the CrsGraph constructor.
//
// Reports std::runtime_error if the CrsGraph constructor throws a
// std::exception, and std::logic_error if the new graph is not fill
// complete or if the matrix's own fill state is inconsistent at the end.
586 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
588 CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
589 const Teuchos::RCP<const map_type>& colMap,
590 const local_matrix_device_type& lclMatrix,
591 const Teuchos::RCP<Teuchos::ParameterList>& params) :
592 dist_object_type (rowMap),
593 storageStatus_ (Details::STORAGE_1D_PACKED),
594 fillComplete_ (true)
595 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
596 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
597 "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
    // Appended to the two end-of-constructor consistency messages.
598 const char suffix[] =
599 " Please report this bug to the Tpetra developers.";
600
601 Teuchos::RCP<crs_graph_type> graph;
602 try {
603 graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
604 lclMatrix.graph, params));
605 }
606 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
607 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
608 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
609 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
610 "exception: " << e.what ());
611 }
612 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
613 (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
614 "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
615 "did not produce a fill-complete graph. Please report this bug to the "
616 "Tpetra developers.");
617 // myGraph_ not null means that the matrix owns the graph. This
618 // is true because the column indices come in as nonconst through
619 // the matrix, implying shared ownership.
620 myGraph_ = graph;
621 staticGraph_ = graph;
622
623 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
    // Packed and unpacked storage refer to the same wrapper here.
624 valuesUnpacked_wdv = valuesPacked_wdv;
625
    // Sanity checks on the matrix's own fill state, which was set via
    // fillComplete_ (true) in the initializer list.
626 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
627 (isFillActive (), std::logic_error,
628 "At the end of a CrsMatrix constructor that should produce "
629 "a fillComplete matrix, isFillActive() is true." << suffix);
630 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
631 (! isFillComplete (), std::logic_error, "At the end of a "
632 "CrsMatrix constructor that should produce a fillComplete "
633 "matrix, isFillComplete() is false." << suffix);
635 }
636
// Constructor taking a local (Kokkos) matrix and all four Maps.
//
// Sets fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED in the initializer list, builds a new
// crs_graph_type from (lclMatrix.graph, rowMap, colMap, domainMap,
// rangeMap, params), requires that graph to be fill complete, stores it
// in both myGraph_ and staticGraph_, and wraps lclMatrix.values in
// valuesPacked_wdv; valuesUnpacked_wdv is set to the same wrapper.
//
// \param lclMatrix [in] Local matrix supplying the graph and the values.
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param domainMap [in] Domain Map, forwarded to the CrsGraph constructor.
// \param rangeMap [in] Range Map, forwarded to the CrsGraph constructor.
// \param params [in] Forwarded to the CrsGraph constructor.
//
// Reports std::runtime_error if the CrsGraph constructor throws a
// std::exception, and std::logic_error if the new graph is not fill
// complete or if the matrix's own fill state is inconsistent at the end.
//
// NOTE(review): the message strings list the Maps before the local
// matrix/graph, whereas the actual parameter order puts lclMatrix first.
637 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
639 CrsMatrix (const local_matrix_device_type& lclMatrix,
640 const Teuchos::RCP<const map_type>& rowMap,
641 const Teuchos::RCP<const map_type>& colMap,
642 const Teuchos::RCP<const map_type>& domainMap,
643 const Teuchos::RCP<const map_type>& rangeMap,
644 const Teuchos::RCP<Teuchos::ParameterList>& params) :
645 dist_object_type (rowMap),
646 storageStatus_ (Details::STORAGE_1D_PACKED),
647 fillComplete_ (true)
648 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
649 const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
650 "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
651 "local_matrix_device_type[, RCP<ParameterList>]): ";
    // Appended to the internal-consistency messages below.
652 const char suffix[] =
653 " Please report this bug to the Tpetra developers.";
654
655 Teuchos::RCP<crs_graph_type> graph;
656 try {
657 graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
658 domainMap, rangeMap, params));
659 }
660 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
661 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
662 (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
663 "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
664 "RCP<ParameterList>]) threw an exception: " << e.what ());
665 }
666 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
667 (! graph->isFillComplete (), std::logic_error, "CrsGraph "
668 "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
669 "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
670 "not produce a fillComplete graph." << suffix);
671 // myGraph_ not null means that the matrix owns the graph. This
672 // is true because the column indices come in as nonconst through
673 // the matrix, implying shared ownership.
674 myGraph_ = graph;
675 staticGraph_ = graph;
676
677 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
    // Packed and unpacked storage refer to the same wrapper here.
678 valuesUnpacked_wdv = valuesPacked_wdv;
679
    // Sanity checks on the matrix's own fill state, which was set via
    // fillComplete_ (true) in the initializer list.
680 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
681 (isFillActive (), std::logic_error,
682 "At the end of a CrsMatrix constructor that should produce "
683 "a fillComplete matrix, isFillActive() is true." << suffix);
684 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
685 (! isFillComplete (), std::logic_error, "At the end of a "
686 "CrsMatrix constructor that should produce a fillComplete "
687 "matrix, isFillComplete() is false." << suffix);
689 }
// Constructor taking a local (Kokkos) matrix, all four Maps, and
// caller-supplied Import and Export objects.
//
// Sets fillComplete_ to true and storageStatus_ to
// Details::STORAGE_1D_PACKED in the initializer list, builds a new
// crs_graph_type from (lclMatrix.graph, rowMap, colMap, domainMap,
// rangeMap, importer, exporter, params), requires that graph to be fill
// complete, stores it in both myGraph_ and staticGraph_, and wraps
// lclMatrix.values in valuesPacked_wdv; valuesUnpacked_wdv is set to the
// same wrapper.
//
// \param lclMatrix [in] Local matrix supplying the graph and the values.
// \param rowMap [in] Row Map; also passed to the dist_object_type base.
// \param colMap [in] Column Map, forwarded to the CrsGraph constructor.
// \param domainMap [in] Domain Map, forwarded to the CrsGraph constructor.
// \param rangeMap [in] Range Map, forwarded to the CrsGraph constructor.
// \param importer [in] Forwarded to the CrsGraph constructor.
// \param exporter [in] Forwarded to the CrsGraph constructor.
// \param params [in] Forwarded to the CrsGraph constructor.
//
// Reports std::runtime_error if the CrsGraph constructor throws a
// std::exception, and std::logic_error if the new graph is not fill
// complete or if the matrix's own fill state is inconsistent at the end.
691 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
693 CrsMatrix (const local_matrix_device_type& lclMatrix,
694 const Teuchos::RCP<const map_type>& rowMap,
695 const Teuchos::RCP<const map_type>& colMap,
696 const Teuchos::RCP<const map_type>& domainMap,
697 const Teuchos::RCP<const map_type>& rangeMap,
698 const Teuchos::RCP<const import_type>& importer,
699 const Teuchos::RCP<const export_type>& exporter,
700 const Teuchos::RCP<Teuchos::ParameterList>& params) :
701 dist_object_type (rowMap),
702 storageStatus_ (Details::STORAGE_1D_PACKED),
703 fillComplete_ (true)
704 {
705 using Teuchos::rcp;
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
706 const char tfecfFuncName[] = "Tpetra::CrsMatrix"
707 "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
    // Appended to the two end-of-constructor consistency messages.
708 const char suffix[] =
709 " Please report this bug to the Tpetra developers.";
710
711 Teuchos::RCP<crs_graph_type> graph;
712 try {
713 graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
714 domainMap, rangeMap, importer,
715 exporter, params));
716 }
717 catch (std::exception& e) {
      // Condition is the literal 'true', so this always reports.
718 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
719 (true, std::runtime_error, "CrsGraph constructor "
720 "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
721 "params) threw: " << e.what ());
722 }
723 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
724 (!graph->isFillComplete (), std::logic_error, "CrsGraph "
725 "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
726 "Export, params) did not produce a fill-complete graph. "
727 "Please report this bug to the Tpetra developers.");
728 // myGraph_ not null means that the matrix owns the graph. This
729 // is true because the column indices come in as nonconst through
730 // the matrix, implying shared ownership.
731 myGraph_ = graph;
732 staticGraph_ = graph;
733
734 valuesPacked_wdv = values_wdv_type(lclMatrix.values);
    // Packed and unpacked storage refer to the same wrapper here.
735 valuesUnpacked_wdv = valuesPacked_wdv;
736
    // Sanity checks on the matrix's own fill state, which was set via
    // fillComplete_ (true) in the initializer list.
737 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
738 (isFillActive (), std::logic_error,
739 "At the end of a CrsMatrix constructor that should produce "
740 "a fillComplete matrix, isFillActive() is true." << suffix);
741 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
742 (! isFillComplete (), std::logic_error, "At the end of a "
743 "CrsMatrix constructor that should produce a fillComplete "
744 "matrix, isFillComplete() is false." << suffix);
746 }
747
// Constructor that builds a matrix from an existing one, either copying
// or sharing its values depending on copyOrView.
//
// NOTE(review): the lines that declare the constructor name and its first
// parameter are not visible in this listing; the body uses `source` as a
// fill-complete CrsMatrix -- confirm against the class declaration.
//
// The new matrix takes its row Map and staticGraph_ from
// source.getCrsGraph() and its storageStatus_ from source.storageStatus_.
//   - Teuchos::Copy: allocates a new values view of the same extent as
//     source's device values, deep-copies into it, and wraps it for both
//     valuesPacked_wdv and valuesUnpacked_wdv.
//   - Teuchos::View: constructs valuesPacked_wdv / valuesUnpacked_wdv
//     from the corresponding members of source.
// In both cases fillComplete(source.getDomainMap(), source.getRangeMap())
// is then called, followed by checkInternalState().
//
// \param copyOrView [in] Teuchos::Copy or Teuchos::View.
//
// Reports std::invalid_argument if source is not fill complete or if
// copyOrView is neither Teuchos::Copy nor Teuchos::View.
748 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
751 const Teuchos::DataAccess copyOrView):
752 dist_object_type (source.getCrsGraph()->getRowMap ()),
753 staticGraph_ (source.getCrsGraph()),
754 storageStatus_ (source.storageStatus_)
755 {
    // Not referenced by name below; presumably consumed by the
    // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
756 const char tfecfFuncName[] = "Tpetra::CrsMatrix("
757 "const CrsMatrix&, const Teuchos::DataAccess): ";
    // NOTE(review): the message says "Source graph", but the condition
    // tests source.isFillComplete(), i.e. the matrix.
758 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
759 (! source.isFillComplete (), std::invalid_argument,
760 "Source graph must be fillComplete().");
761
762 if (copyOrView == Teuchos::Copy) {
763 using values_type = typename local_matrix_device_type::values_type;
764 auto vals = source.getLocalValuesDevice (Access::ReadOnly);
765 using Kokkos::view_alloc;
766 using Kokkos::WithoutInitializing;
      // Allocated WithoutInitializing because deep_copy below overwrites
      // the whole view.
767 values_type newvals (view_alloc ("val", WithoutInitializing),
768 vals.extent (0));
769 // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
770 Kokkos::deep_copy (newvals, vals);
771 valuesPacked_wdv = values_wdv_type(newvals);
772 valuesUnpacked_wdv = valuesPacked_wdv;
773 fillComplete (source.getDomainMap (), source.getRangeMap ());
774 }
775 else if (copyOrView == Teuchos::View) {
776 valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
777 valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
778 fillComplete (source.getDomainMap (), source.getRangeMap ());
779 }
780 else {
      // Condition is the literal 'true', so this always reports.
781 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
782 (true, std::invalid_argument, "Second argument 'copyOrView' "
783 "has an invalid value " << copyOrView << ". Valid values "
784 "include Teuchos::Copy = " << Teuchos::Copy << " and "
785 "Teuchos::View = " << Teuchos::View << ".");
786 }
787 checkInternalState();
788 }
789
// Exchange state with another matrix, member by member.
//
// NOTE(review): the lines carrying the function name and its parameter
// are not visible in this listing; the body refers to the other object as
// `crs_matrix` -- confirm the exact signature against the class
// declaration.
//
// Swaps exactly these members via std::swap: importMV_, exportMV_,
// staticGraph_, myGraph_, valuesPacked_wdv, valuesUnpacked_wdv,
// storageStatus_, fillComplete_, and nonlocals_.  No other state is
// touched in this body.
790 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
791 void
794 {
795 std::swap(crs_matrix.importMV_, this->importMV_);
796 std::swap(crs_matrix.exportMV_, this->exportMV_);
797 std::swap(crs_matrix.staticGraph_, this->staticGraph_);
798 std::swap(crs_matrix.myGraph_, this->myGraph_);
799 std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
800 std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
801 std::swap(crs_matrix.storageStatus_, this->storageStatus_);
802 std::swap(crs_matrix.fillComplete_, this->fillComplete_);
803 std::swap(crs_matrix.nonlocals_, this->nonlocals_);
804 }
805
// Return the communicator reported by this matrix's graph
// (forwards to getCrsGraphRef().getComm()).
806 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
807 Teuchos::RCP<const Teuchos::Comm<int> >
809 getComm () const {
810 return getCrsGraphRef ().getComm ();
811 }
// Return the matrix's own fill-complete flag (fillComplete_).
// Unlike most accessors here, this does not consult the graph.
813 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
814 bool
816 isFillComplete () const {
817 return fillComplete_;
818 }
819
// Return the logical negation of fillComplete_, so that exactly one of
// isFillActive() and isFillComplete() is true at any time.
820 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
821 bool
823 isFillActive () const {
824 return ! fillComplete_;
825 }
826
// Return whether the graph reports optimized storage
// (forwards to getCrsGraphRef().isStorageOptimized()).
827 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
828 bool
830 isStorageOptimized () const {
831 return this->getCrsGraphRef ().isStorageOptimized ();
832 }
833
// Return whether the graph reports being locally indexed
// (forwards to getCrsGraphRef().isLocallyIndexed()).
834 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
835 bool
837 isLocallyIndexed () const {
838 return getCrsGraphRef ().isLocallyIndexed ();
839 }
840
// Return whether the graph reports being globally indexed
// (forwards to getCrsGraphRef().isGloballyIndexed()).
841 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
842 bool
844 isGloballyIndexed () const {
845 return getCrsGraphRef ().isGloballyIndexed ();
846 }
// Forwards to the graph's hasColMap().
848 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
849 bool
851 hasColMap () const {
852 return getCrsGraphRef ().hasColMap ();
853 }
854
// Forwards to the graph's getGlobalNumEntries().
// NOTE(review): the return-type, class-qualifier and function-name lines are
// absent from this listing; the member is presumably getGlobalNumEntries()
// const -- confirm against the class declaration.
855 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
859 return getCrsGraphRef ().getGlobalNumEntries ();
860 }
861
// Forwards to the graph's getLocalNumEntries().
862 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
863 size_t
865 getLocalNumEntries () const {
866 return getCrsGraphRef ().getLocalNumEntries ();
867 }
868
// Forwards to the graph's getGlobalNumRows().
// NOTE(review): the return-type and class-qualifier lines are absent from
// this listing.
869 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
872 getGlobalNumRows () const {
873 return getCrsGraphRef ().getGlobalNumRows ();
874 }
875
// Forwards to the graph's getGlobalNumCols().
// NOTE(review): the return-type and class-qualifier lines are absent from
// this listing.
876 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
879 getGlobalNumCols () const {
880 return getCrsGraphRef ().getGlobalNumCols ();
881 }
882
// Forwards to the graph's getLocalNumRows().
883 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
884 size_t
886 getLocalNumRows () const {
887 return getCrsGraphRef ().getLocalNumRows ();
888 }
889
890
// Forwards to the graph's getLocalNumCols().
891 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892 size_t
894 getLocalNumCols () const {
895 return getCrsGraphRef ().getLocalNumCols ();
896 }
897
898
// Forwards globalRow to the graph's getNumEntriesInGlobalRow() and returns
// its answer unchanged.
899 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
900 size_t
902 getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
903 return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
904 }
905
// Forwards localRow to the graph's getNumEntriesInLocalRow() and returns its
// answer unchanged.
906 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
907 size_t
909 getNumEntriesInLocalRow (LocalOrdinal localRow) const {
910 return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
911 }
912
// Forwards to the graph's getGlobalMaxNumRowEntries().
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; the member is presumably getGlobalMaxNumRowEntries() const.
913 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
914 size_t
917 return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
918 }
919
// Forwards to the graph's getLocalMaxNumRowEntries().
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; the member is presumably getLocalMaxNumRowEntries() const.
920 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921 size_t
924 return getCrsGraphRef ().getLocalMaxNumRowEntries ();
925 }
926
// Returns the index base of the row Map (getRowMap()->getIndexBase()).
// The row Map pointer is dereferenced without a null check.
927 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
928 GlobalOrdinal
930 getIndexBase () const {
931 return getRowMap ()->getIndexBase ();
932 }
933
// Returns the graph's row Map.
934 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
937 getRowMap () const {
938 return getCrsGraphRef ().getRowMap ();
939 }
940
// Returns the graph's column Map.
941 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
942 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
944 getColMap () const {
945 return getCrsGraphRef ().getColMap ();
946 }
947
// Returns the graph's domain Map.
948 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
949 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
951 getDomainMap () const {
952 return getCrsGraphRef ().getDomainMap ();
953 }
954
// Returns the graph's range Map.
955 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
956 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
958 getRangeMap () const {
959 return getCrsGraphRef ().getRangeMap ();
960 }
961
// Returns the matrix's graph through the RowGraph interface: staticGraph_
// when it is non-null, otherwise myGraph_. No check is made that myGraph_
// is non-null, so the result is a null RCP when both pointers are null.
962 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
963 Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
965 getGraph () const {
966 if (staticGraph_ != Teuchos::null) {
967 return staticGraph_;
968 }
969 return myGraph_;
970 }
971
// Same selection as getGraph(), but typed as CrsGraph: staticGraph_ when it
// is non-null, otherwise myGraph_ (returned as-is, even if null).
972 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
973 Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
975 getCrsGraph () const {
976 if (staticGraph_ != Teuchos::null) {
977 return staticGraph_;
978 }
979 return myGraph_;
980 }
981
// Returns a reference to the matrix's graph: *staticGraph_ when that pointer
// is non-null, otherwise *myGraph_.
//
// When HAVE_TPETRA_DEBUG is defined, a null myGraph_ in the fallback path
// raises std::logic_error. Without that macro no check is made before
// myGraph_ is dereferenced.
// NOTE(review): the closing brace of the first `if` is not present in this
// listing (the listing skips from its line 994 to 996).
982 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
983 const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>&
984 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
985 getCrsGraphRef () const
986 {
// Compile-time switch: the null check below is only active in debug builds.
987#ifdef HAVE_TPETRA_DEBUG
988 constexpr bool debug = true;
989#else
990 constexpr bool debug = false;
991#endif // HAVE_TPETRA_DEBUG
992
993 if (! this->staticGraph_.is_null ()) {
994 return * (this->staticGraph_);
996 else {
997 if (debug) {
// tfecfFuncName is declared for the exception macro on the next line
// (presumably used to build the message prefix -- confirm in Teuchos).
998 const char tfecfFuncName[] = "getCrsGraphRef: ";
999 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1000 (this->myGraph_.is_null (), std::logic_error,
1001 "Both staticGraph_ and myGraph_ are null. "
1002 "Please report this bug to the Tpetra developers.");
1003 }
1004 return * (this->myGraph_);
1005 }
1006 }
1007
// Builds and returns, by value, a local_matrix_device_type assembled from:
//   - the number of local elements in the static graph's column Map,
//   - the device view of valuesPacked_wdv, requested with ReadWrite access
//     even though this member is const,
//   - the static graph's local device graph.
// staticGraph_ and its column Map are dereferenced without null checks.
1008 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1009 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
1011 getLocalMatrixDevice () const
1012 {
1013 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1014 return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
1015 numCols,
1016 valuesPacked_wdv.getDeviceView(Access::ReadWrite),
1017 staticGraph_->getLocalGraphDevice());
1018 }
1019
// Host counterpart of getLocalMatrixDevice(): builds and returns, by value, a
// local_matrix_host_type from the column Map's local element count, the host
// view of valuesPacked_wdv (requested with ReadWrite access) and the static
// graph's local host graph.
// staticGraph_ and its column Map are dereferenced without null checks.
1020 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1021 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
1023 getLocalMatrixHost () const
1024 {
1025 auto numCols = staticGraph_->getColMap()->getLocalNumElements();
1026 return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1027 valuesPacked_wdv.getHostView(Access::ReadWrite),
1028 staticGraph_->getLocalGraphHost());
1029 }
1030
// Wraps the local device matrix in a newly allocated local_multiply_op_type
// and returns it through a std::shared_ptr. Each call makes a fresh operator
// and a fresh shared copy of the local matrix handle.
//
// When both HAVE_TPETRACORE_CUDA and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE are
// defined, the Node is KokkosCudaWrapperNode, and the local entry count fits
// in LocalOrdinal, the operator is additionally given a LocalOrdinal-typed
// copy of the row offsets (this->ordinalRowptrs). That copy is built the
// first time it is needed and reused on later calls while its data pointer
// is non-null.
// NOTE(review): nothing visible in this function refreshes ordinalRowptrs
// once it has been built -- confirm it is reset wherever the graph's row
// offsets can change.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; the member is presumably getLocalMultiplyOperator() const.
1031// KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1032 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1033 std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1036 {
1037 auto localMatrix = getLocalMatrixDevice();
1038#ifdef HAVE_TPETRACORE_CUDA
1039#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
// Only take the cuSPARSE-friendly path when every row offset is
// representable as a LocalOrdinal and we are on the CUDA node type.
1040 if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()) &&
1041 std::is_same<Node, Kokkos::Compat::KokkosCudaWrapperNode>::value)
1042 {
// A null data pointer means the LocalOrdinal-typed offsets have not been
// built yet.
1043 if(this->ordinalRowptrs.data() == nullptr)
1044 {
1045 auto originalRowptrs = localMatrix.graph.row_map;
1046 //create LocalOrdinal-typed copy of the local graph's rowptrs.
1047 //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1048 this->ordinalRowptrs = ordinal_rowptrs_type(
1049 Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1050 auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
// Element-wise copy; the assignment converts each offset to the element
// type of ordinalRowptrs_.
1051 Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1052 Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1053 KOKKOS_LAMBDA(LocalOrdinal i)
1054 {
1055 ordinalRowptrs_(i) = originalRowptrs(i);
1056 });
1057 }
1058 //return local operator using ordinalRowptrs
1059 return std::make_shared<local_multiply_op_type>(
1060 std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1061 }
1062#endif
1063#endif
// Default path: operator built from the local matrix alone.
1064// KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1065 return std::make_shared<local_multiply_op_type>(
1066 std::make_shared<local_matrix_device_type>(localMatrix));
1067 }
// The graph is reported as static exactly when the matrix holds no
// matrix-owned (nonconst) graph, i.e. when myGraph_ is null.
1069 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1070 bool
1072 isStaticGraph () const {
1073 return myGraph_.is_null ();
1074 }
1075
// Unconditionally returns true.
1076 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1077 bool
1079 hasTransposeApply () const {
1080 return true;
1081 }
1082
// Unconditionally returns true.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; the member is presumably supportsRowViews() const -- confirm
// against the class declaration.
1083 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1084 bool
1087 return true;
1088 }
1089
// Allocates the unpacked values array (valuesUnpacked_wdv), sized from the
// static graph's unpacked row offsets. If gas == GraphNotYetAllocated, the
// matrix-owned graph is first asked to allocate its indices.
//
// lg      -- forwarded to CrsGraph::allocateIndices; also echoed in the
//            verbose output as "Local"/"Global" indices.
// gas     -- the caller's assertion about whether the graph's indices are
//            already allocated; cross-checked against the graph in debug mode.
// verbose -- when true, progress messages are written to std::cerr.
//
// Error reporting: std::logic_error for inconsistent internal state (checked
// only when Behavior::debug("CrsMatrix") is true), std::runtime_error when
// CrsGraph::allocateIndices throws anything.
1090 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1091 void
1093 allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1094 const bool verbose)
1095 {
1096 using Details::Behavior;
1098 using std::endl;
1099 const char tfecfFuncName[] = "allocateValues: ";
1100 const char suffix[] =
1101 " Please report this bug to the Tpetra developers.";
1102 ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1103
// prefix is only populated (and only dereferenced) when verbose is true.
1104 std::unique_ptr<std::string> prefix;
1105 if (verbose) {
1106 prefix = this->createPrefix("CrsMatrix", "allocateValues");
1107 std::ostringstream os;
1108 os << *prefix << "lg: "
1109 << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1110 << ", gas: Graph"
1111 << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1112 << "Allocated" << endl;
1113 std::cerr << os.str();
1114 }
1115
1116 const bool debug = Behavior::debug("CrsMatrix");
1117 if (debug) {
1118 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119 (this->staticGraph_.is_null (), std::logic_error,
1120 "staticGraph_ is null." << suffix);
1121
1122 // If the graph indices are already allocated, then gas should be
1123 // GraphAlreadyAllocated. Otherwise, gas should be
1124 // GraphNotYetAllocated.
1125 if ((gas == GraphAlreadyAllocated) !=
1126 staticGraph_->indicesAreAllocated ()) {
// The two tests below cover the two ways the caller and the graph can
// disagree; the message fragments are stitched together so that "not "
// lands on the appropriate side of the sentence.
1127 const char err1[] = "The caller has asserted that the graph "
1128 "is ";
1129 const char err2[] = "already allocated, but the static graph "
1130 "says that its indices are ";
1131 const char err3[] = "already allocated. ";
1132 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133 (gas == GraphAlreadyAllocated &&
1134 ! staticGraph_->indicesAreAllocated (), std::logic_error,
1135 err1 << err2 << "not " << err3 << suffix);
1136 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1137 (gas != GraphAlreadyAllocated &&
1138 staticGraph_->indicesAreAllocated (), std::logic_error,
1139 err1 << "not " << err2 << err3 << suffix);
1140 }
1141
1142 // If the graph is unallocated, then it had better be a
1143 // matrix-owned graph. ("Matrix-owned graph" means that the
1144 // matrix gets to define the graph structure. If the CrsMatrix
1145 // constructor that takes an RCP<const CrsGraph> was used, then
1146 // the matrix does _not_ own the graph.)
1147 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1148 (! this->staticGraph_->indicesAreAllocated () &&
1149 this->myGraph_.is_null (), std::logic_error,
1150 "The static graph says that its indices are not allocated, "
1151 "but the graph is not owned by the matrix." << suffix);
1152 }
1153
1154 if (gas == GraphNotYetAllocated) {
1155 if (debug) {
1156 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1157 (this->myGraph_.is_null (), std::logic_error,
1158 "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1159 }
// Any exception from the graph is rethrown as std::runtime_error, so the
// message carries this function's prefix.
1160 try {
1161 this->myGraph_->allocateIndices (lg, verbose);
1162 }
1163 catch (std::exception& e) {
1164 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1165 (true, std::runtime_error, "CrsGraph::allocateIndices "
1166 "threw an exception: " << e.what ());
1167 }
1168 catch (...) {
1169 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1170 (true, std::runtime_error, "CrsGraph::allocateIndices "
1171 "threw an exception not a subclass of std::exception.");
1172 }
1173 }
1174
1175 // Allocate matrix values.
1176 const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
// NOTE(review): k_ptrs is not referenced again in this function.
1177 typename Graph::local_graph_device_type::row_map_type k_ptrs =
1178 this->staticGraph_->rowPtrsUnpacked_dev_;
1179
// The entry at index lclNumRows of the unpacked row offsets is used as the
// total allocation size.
// assumes rowPtrsUnpacked_host_ holds lclNumRows+1 entries -- TODO confirm
1180 const size_t lclTotalNumEntries =
1181 this->staticGraph_->rowPtrsUnpacked_host_(lclNumRows);
1182
1183 // Allocate the array of unpacked matrix values (valuesUnpacked_wdv).
1184 using values_type = typename local_matrix_device_type::values_type;
1185 if (verbose) {
1186 std::ostringstream os;
1187 os << *prefix << "Allocate values_wdv: Pre "
1188 << valuesUnpacked_wdv.extent(0) << ", post "
1189 << lclTotalNumEntries << endl;
1190 std::cerr << os.str();
1191 }
1192// this->k_values1D_ =
1193 valuesUnpacked_wdv = values_wdv_type(
1194 values_type("Tpetra::CrsMatrix::values",
1195 lclTotalNumEntries));
1196 }
1197
1198
// Finalizes the local compressed-sparse-row storage of both the matrix-owned
// graph (myGraph_) and the matrix values.
//
// Steps, in order:
//   1. If the graph's local entry count differs from its allocation size,
//      build packed row offsets from the per-row entry counts and copy the
//      column indices and values into newly allocated packed arrays.
//      Otherwise make the packed members share the unpacked arrays.
//   2. Decide whether optimized storage is requested: the "Optimize Storage"
//      entry of params when params is non-null, else a default.
//   3. If requested, release k_numRowEntries_, point the unpacked members at
//      the packed arrays, and mark both graph and matrix STORAGE_1D_PACKED.
//
// params -- may be null; only the "Optimize Storage" entry is read here.
//
// Consistency checks throwing std::logic_error run only when
// Details::Behavior::debug("CrsMatrix") is true; progress messages go to
// std::cerr only when Details::Behavior::verbose("CrsMatrix") is true.
1199 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1200 void
1202 fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1203 {
1204 using ::Tpetra::Details::computeOffsetsFromCounts;
1205 using ::Tpetra::Details::getEntryOnHost;
1206 using Teuchos::arcp_const_cast;
1207 using Teuchos::Array;
1208 using Teuchos::ArrayRCP;
1209 using Teuchos::null;
1210 using Teuchos::RCP;
1211 using Teuchos::rcp;
1212 using std::endl;
1213 using row_map_type = typename local_graph_device_type::row_map_type;
1214 using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1215 using values_type = typename local_matrix_device_type::values_type;
1216 Details::ProfilingRegion regionFLGAM
1217 ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1218
1219 const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1220 "fillComplete or expertStaticFillComplete): ";
1221 const char suffix[] =
1222 " Please report this bug to the Tpetra developers.";
1223 const bool debug = Details::Behavior::debug("CrsMatrix");
1224 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1225
// prefix is only populated (and only dereferenced) when verbose is true.
1226 std::unique_ptr<std::string> prefix;
1227 if (verbose) {
1228 prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1229 std::ostringstream os;
1230 os << *prefix << endl;
1231 std::cerr << os.str ();
1232 }
1233
1234 if (debug) {
1235 // fillComplete() only calls fillLocalGraphAndMatrix() if the
1236 // matrix owns the graph, which means myGraph_ is not null.
1237 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1238 (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1239 "(myGraph_) is null. This means that the matrix has a "
1240 "const (a.k.a. \"static\") graph. fillComplete or "
1241 "expertStaticFillComplete should never call "
1242 "fillLocalGraphAndMatrix in that case." << suffix);
1243 }
1244
1245 const size_t lclNumRows = this->getLocalNumRows ();
1246
1247 // This method's goal is to fill in the three arrays (compressed
1248 // sparse row format) that define the sparse graph's and matrix's
1249 // structure, and the sparse matrix's values.
1251 // Get references to the data in myGraph_, so we can modify them
1252 // as well. Note that we only call fillLocalGraphAndMatrix() if
1253 // the matrix owns the graph, which means myGraph_ is not null.
1254
1255 typedef decltype (myGraph_->k_numRowEntries_) row_entries_type;
1256
// curRowOffsets: the graph's current (unpacked) row offsets on device.
1257 typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1258 myGraph_->rowPtrsUnpacked_dev_;
1259
1260 if (debug) {
1261 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1262 (curRowOffsets.extent (0) == 0, std::logic_error,
1263 "curRowOffsets.extent(0) == 0.");
1264 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1265 (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1266 "curRowOffsets.extent(0) = "
1267 << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1268 << (lclNumRows + 1) << ".");
// The read at numOffsets - 1 is safe because a zero extent was rejected by
// the first check above, which also makes the `numOffsets != 0` operand
// below always true.
// NOTE(review): the message prints index numOffsets, but the value shown
// was read at index numOffsets - 1.
1269 const size_t numOffsets = curRowOffsets.extent (0);
1270 const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1271 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1272 (numOffsets != 0 &&
1273 myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1274 std::logic_error, "numOffsets = " <<
1275 numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1276 << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1277 << numOffsets << ") = " << valToCheck << ".");
1278 }
1279
// Packing is needed exactly when fewer entries are in use than were
// allocated.
1280 if (myGraph_->getLocalNumEntries() !=
1281 myGraph_->getLocalAllocationSize()) {
1282
1283 // Use the nonconst version of row_map_type for k_ptrs,
1284 // because row_map_type is const and we need to modify k_ptrs here.
1285 typename row_map_type::non_const_type k_ptrs;
1286 row_map_type k_ptrs_const;
1287 lclinds_1d_type k_inds;
1288 values_type k_vals;
1289
1290 if (verbose) {
1291 std::ostringstream os;
1292 const auto numEnt = myGraph_->getLocalNumEntries();
1293 const auto allocSize = myGraph_->getLocalAllocationSize();
1294 os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1295 << ", allocSize=" << allocSize << endl;
1296 std::cerr << os.str ();
1297 }
1298 // The matrix's current 1-D storage is "unpacked." This means
1299 // the row offsets may differ from what the final row offsets
1300 // should be. This could happen, for example, if the user
1301 // set an upper
1302 // bound on the number of entries per row, but didn't fill all
1303 // those entries.
1304 if (debug && curRowOffsets.extent (0) != 0) {
1305 const size_t numOffsets =
1306 static_cast<size_t> (curRowOffsets.extent (0));
1307 const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1308 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1309 (static_cast<size_t> (valToCheck) !=
1310 static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1311 std::logic_error, "(unpacked branch) Before "
1312 "allocating or packing, curRowOffsets(" << (numOffsets-1)
1313 << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1314 " = " << valuesUnpacked_wdv.extent (0) << ".");
1315 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1316 (static_cast<size_t> (valToCheck) !=
1317 static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1318 std::logic_error, "(unpacked branch) Before "
1319 "allocating or packing, curRowOffsets(" << (numOffsets-1)
1320 << ") = " << valToCheck
1321 << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1322 << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1323 }
1324 // Pack the row offsets into k_ptrs, by doing a sum-scan of
1325 // the array of valid entry counts per row.
1326
1327 // Total number of entries in the matrix on the calling
1328 // process. We will compute this in the loop below. It's
1329 // cheap to compute and useful as a sanity check.
1330 size_t lclTotalNumEntries = 0;
1331 {
// NOTE(review): the comments in this scope describe k_ptrs as const, but
// k_ptrs is declared above with the non_const_type; the const alias handed
// to the graph is k_ptrs_const. No loop is visible here either -- the
// offsets come from computeOffsetsFromCounts.
1332 // Allocate the packed row offsets array. We use a nonconst
1333 // temporary (packedRowOffsets) here, because k_ptrs is
1334 // const. We will assign packedRowOffsets to k_ptrs below.
1335 if (verbose) {
1336 std::ostringstream os;
1337 os << *prefix << "Allocate packed row offsets: "
1338 << (lclNumRows+1) << endl;
1339 std::cerr << os.str ();
1340 }
1341 typename row_map_type::non_const_type
1342 packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1343 typename row_entries_type::const_type numRowEnt_h =
1344 myGraph_->k_numRowEntries_;
1345 // We're computing offsets on device. This function can
1346 // handle numRowEnt_h being a host View.
1347 lclTotalNumEntries =
1348 computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1349 // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1350 // to use packedRowOffsets in the loop above and assign here.
1351 k_ptrs = packedRowOffsets;
1352 k_ptrs_const = k_ptrs;
1353 }
1354
1355 if (debug) {
1356 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1357 (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1358 std::logic_error,
1359 "(unpacked branch) After packing k_ptrs, "
1360 "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1361 "lclNumRows+1 = " << (lclNumRows+1) << ".");
1362 const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1363 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1364 (valToCheck != lclTotalNumEntries, std::logic_error,
1365 "(unpacked branch) After filling k_ptrs, "
1366 "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1367 << " != total number of entries on the calling process = "
1368 << lclTotalNumEntries << ".");
1369 }
1370
1371 // Allocate the arrays of packed column indices and values.
1372 if (verbose) {
1373 std::ostringstream os;
1374 os << *prefix << "Allocate packed local column indices: "
1375 << lclTotalNumEntries << endl;
1376 std::cerr << os.str ();
1377 }
1378 k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1379 if (verbose) {
1380 std::ostringstream os;
1381 os << *prefix << "Allocate packed values: "
1382 << lclTotalNumEntries << endl;
1383 std::cerr << os.str ();
1384 }
1385 k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1387 // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1388 // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1389 // the packed row offsets array k_ptrs that we created above.
1390 //
1391 // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1392 // need to keep around the unpacked row offsets, column
1393 // indices, and values arrays.
1394
1395 // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1396 // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1397 using inds_packer_type = pack_functor<
1398 typename Graph::local_graph_device_type::entries_type::non_const_type,
1399 typename Graph::local_inds_dualv_type::t_dev::const_type,
1400 typename Graph::local_graph_device_type::row_map_type::non_const_type,
1401 typename Graph::local_graph_device_type::row_map_type>;
1402 inds_packer_type indsPacker (
1403 k_inds,
1404 myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1405 k_ptrs, curRowOffsets);
// One functor invocation per local row, over [0, lclNumRows).
1406 using exec_space = typename decltype (k_inds)::execution_space;
1407 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1408 Kokkos::parallel_for
1409 ("Tpetra::CrsMatrix pack column indices",
1410 range_type (0, lclNumRows), indsPacker);
1411
1412 // Pack the values from unpacked valuesUnpacked_wdv into packed
1413 // k_vals. We will replace valuesPacked_wdv below.
1414 using vals_packer_type = pack_functor<
1415 typename values_type::non_const_type,
1416 typename values_type::const_type,
1417 typename row_map_type::non_const_type,
1418 typename row_map_type::const_type>;
1419 vals_packer_type valsPacker (
1420 k_vals,
1421 this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1422 k_ptrs, curRowOffsets);
1423 Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1424 range_type (0, lclNumRows), valsPacker);
1425
1426 if (debug) {
// NOTE(review): this label mentions "Optimize Storage"=true, but the
// enclosing branch is selected by numEntries != allocationSize, not by
// that parameter.
1427 const char myPrefix[] = "(\"Optimize Storage\""
1428 "=true branch) After packing, ";
1429 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1430 (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1431 << "k_ptrs.extent(0) = 0. This probably means that "
1432 "rowPtrsUnpacked_ was never allocated.");
1433 if (k_ptrs.extent (0) != 0) {
1434 const size_t numOffsets (k_ptrs.extent (0));
1435 const auto valToCheck =
1436 getEntryOnHost (k_ptrs, numOffsets - 1);
1437 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1438 (size_t (valToCheck) != k_vals.extent (0),
1439 std::logic_error, myPrefix <<
1440 "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1441 " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1442 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1443 (size_t (valToCheck) != k_inds.extent (0),
1444 std::logic_error, myPrefix <<
1445 "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1446 " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1447 }
1448 }
1449 // Build the local graph.
1450 myGraph_->setRowPtrsPacked(k_ptrs_const);
1451 myGraph_->lclIndsPacked_wdv =
1452 typename crs_graph_type::local_inds_wdv_type(k_inds);
1453 valuesPacked_wdv = values_wdv_type(k_vals);
1454 }
1455 else { // We don't have to pack, so just set the pointers.
1456 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1457 // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
// The packed members are assigned from the unpacked ones; no new arrays
// are allocated in this branch.
1458 myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1459 myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1460 myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1461 valuesPacked_wdv = valuesUnpacked_wdv;
1462
1463 if (verbose) {
1464 std::ostringstream os;
1465 os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1466 << myGraph_->rowPtrsUnpacked_host_.extent(0) << ", lclIndsUnpacked_wdv: "
1467 << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1468 << valuesUnpacked_wdv.extent(0) << endl;
1469 std::cerr << os.str();
1470 }
1471
1472 if (debug) {
// NOTE(review): this label mentions "Optimize Storage"=false, but this
// branch is the "storage already packed" case (numEntries ==
// allocationSize), independent of that parameter. The messages below also
// say k_ptrs_const although the value checked comes from
// rowPtrsUnpacked_host_.
1473 const char myPrefix[] =
1474 "(\"Optimize Storage\"=false branch) ";
1475 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1476 (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1477 << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1478 "that rowPtrsUnpacked_ was never allocated.");
1479 if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1480 const size_t numOffsets (myGraph_->rowPtrsUnpacked_host_.extent (0));
1481 const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1482 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1483 (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1484 std::logic_error, myPrefix <<
1485 "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1486 << " != valuesPacked_wdv.extent(0) = "
1487 << valuesPacked_wdv.extent (0) << ".");
1488 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1489 (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1490 std::logic_error, myPrefix <<
1491 "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1492 << " != myGraph_->lclIndsPacked.extent(0) = "
1493 << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1494 }
1496 }
1497
// Final cross-check, common to both branches: packed row offsets, packed
// values and packed column indices must agree in size.
1498 if (debug) {
1499 const char myPrefix[] = "After packing, ";
1500 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1501 (size_t (myGraph_->rowPtrsPacked_host_.extent (0)) != size_t (lclNumRows + 1),
1502 std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1503 << myGraph_->rowPtrsPacked_host_.extent (0) << " != lclNumRows+1 = " <<
1504 (lclNumRows+1) << ".");
1505 if (myGraph_->rowPtrsPacked_host_.extent (0) != 0) {
1506 const size_t numOffsets (myGraph_->rowPtrsPacked_host_.extent (0));
1507 const size_t valToCheck = myGraph_->rowPtrsPacked_host_(numOffsets-1);
1508 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1509 (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1510 std::logic_error, myPrefix << "k_ptrs_const(" <<
1511 (numOffsets-1) << ") = " << valToCheck
1512 << " != valuesPacked_wdv.extent(0) = "
1513 << valuesPacked_wdv.extent (0) << ".");
// NOTE(review): "lclIndsPacked_wdvk_inds" in the message below looks like
// a typo for "lclIndsPacked_wdv".
1514 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1515 (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1516 std::logic_error, myPrefix << "k_ptrs_const(" <<
1517 (numOffsets-1) << ") = " << valToCheck
1518 << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1519 << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1520 }
1521 }
1522
1523 // May we ditch the old allocations for the packed (and otherwise
1524 // "optimized") allocations, later in this routine? Optimize
1525 // storage if the graph is not static, or if the graph already has
1526 // optimized storage.
1527 const bool defaultOptStorage =
1528 ! isStaticGraph () || staticGraph_->isStorageOptimized ();
// With a parameter list: use its "Optimize Storage" entry, falling back to
// the default. Without one: use the default directly.
1529 const bool requestOptimizedStorage =
1530 (! params.is_null () &&
1531 params->get ("Optimize Storage", defaultOptStorage)) ||
1532 (params.is_null () && defaultOptStorage);
1533
1534 // The graph has optimized storage when indices are allocated,
1535 // myGraph_->k_numRowEntries_ is empty, and there are more than
1536 // zero rows on this process.
1537 if (requestOptimizedStorage) {
1538 // Free the old, unpacked, unoptimized allocations.
1539 // Free graph data structures that are only needed for
1540 // unpacked 1-D storage.
1541 if (verbose) {
1542 std::ostringstream os;
1543 os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1544 << myGraph_->k_numRowEntries_.extent(0) << endl;
1545 std::cerr << os.str();
1546 }
1547
// Replacing the member with a default-constructed view drops this
// object's handle to the per-row counts.
1548 myGraph_->k_numRowEntries_ = row_entries_type ();
1549
1550 // Keep the new 1-D packed allocations.
1551 // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1552 // We directly set the memory spaces to avoid a memcpy from device to host
1553 myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1554 myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1555 myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1556 valuesUnpacked_wdv = valuesPacked_wdv;
1557
1558 myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1559 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1560 }
1561 else {
// Nothing is released in this branch: the unpacked members keep their
// current arrays, and the packed members set above stay in place.
1562 if (verbose) {
1563 std::ostringstream os;
1564 os << *prefix << "User requested NOT to optimize storage"
1565 << endl;
1566 std::cerr << os.str();
1567 }
1568 }
1569 }
1570
1571 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1572 void
1574 fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1575 {
1576 using ::Tpetra::Details::ProfilingRegion;
1577 using Teuchos::ArrayRCP;
1578 using Teuchos::Array;
1579 using Teuchos::null;
1580 using Teuchos::RCP;
1581 using Teuchos::rcp;
1582 using std::endl;
1583 using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1584 using non_const_row_map_type = typename row_map_type::non_const_type;
1585 using values_type = typename local_matrix_device_type::values_type;
1586 ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1587 const size_t lclNumRows = getLocalNumRows();
1588
1589 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1590 std::unique_ptr<std::string> prefix;
1591 if (verbose) {
1592 prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1593 std::ostringstream os;
1594 os << *prefix << "lclNumRows: " << lclNumRows << endl;
1595 std::cerr << os.str ();
1596 }
1597
1598 // The goals of this routine are first, to allocate and fill
1599 // packed 1-D storage (see below for an explanation) in the vals
1600 // array, and second, to give vals to the local matrix and
1601 // finalize the local matrix. We only need k_ptrs, the packed 1-D
1602 // row offsets, within the scope of this routine, since we're only
1603 // filling the local matrix here (use fillLocalGraphAndMatrix() to
1604 // fill both the graph and the matrix at the same time).
1605
1606 // get data from staticGraph_
1607 size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1608 size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1609 row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1610
1611 row_map_type k_ptrs; // "packed" row offsets array
1612 values_type k_vals; // "packed" values array
1613
1614 // May we ditch the old allocations for the packed (and otherwise
1615 // "optimized") allocations, later in this routine? Request
1616 // optimized storage by default.
1617 bool requestOptimizedStorage = true;
1618 const bool default_OptimizeStorage =
1619 ! isStaticGraph() || staticGraph_->isStorageOptimized();
1620 if (! params.is_null() &&
1621 ! params->get("Optimize Storage", default_OptimizeStorage)) {
1622 requestOptimizedStorage = false;
1623 }
1624 // If we're not allowed to change a static graph, then we can't
1625 // change the storage of the matrix, either. This means that if
1626 // the graph's storage isn't already optimized, we can't optimize
1627 // the matrix's storage either. Check and give warning, as
1628 // appropriate.
1629 if (! staticGraph_->isStorageOptimized () &&
1630 requestOptimizedStorage) {
1632 (true, std::runtime_error, "You requested optimized storage "
1633 "by setting the \"Optimize Storage\" flag to \"true\" in "
1634 "the ParameterList, or by virtue of default behavior. "
1635 "However, the associated CrsGraph was filled separately and "
1636 "requested not to optimize storage. Therefore, the "
1637 "CrsMatrix cannot optimize storage.");
1638 requestOptimizedStorage = false;
1639 }
1640
1641 using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1642
1643 // The matrix's values are currently
1644 // stored in a 1-D format. However, this format is "unpacked";
1645 // it doesn't necessarily have the same row offsets as indicated
1646 // by the ptrs array returned by allocRowPtrs. This could
1647 // happen, for example, if the user
1648 // fixed the number of matrix entries in
1649 // each row, but didn't fill all those entries.
1650 //
1651 // As above, we don't need to keep the "packed" row offsets
1652 // array ptrs here, but we do need it here temporarily, so we
1653 // have to allocate it. We'll free ptrs later in this method.
1654 //
1655 // Note that this routine checks whether storage has already
1656 // been packed. This is a common case for solution of nonlinear
1657 // PDEs using the finite element method, as long as the
1658 // structure of the sparse matrix does not change between linear
1659 // solves.
1660 if (nodeNumEntries != nodeNumAllocated) {
1661 if (verbose) {
1662 std::ostringstream os;
1663 os << *prefix << "Unpacked 1-D storage: numEnt="
1664 << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1665 << endl;
1666 std::cerr << os.str();
1667 }
1668 // We have to pack the 1-D storage, since the user didn't fill
1669 // up all requested storage.
1670 if (verbose) {
1671 std::ostringstream os;
1672 os << *prefix << "Allocate packed row offsets: "
1673 << (lclNumRows+1) << endl;
1674 std::cerr << os.str();
1675 }
1676 non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1677 lclNumRows+1);
1678 // Total number of entries in the matrix on the calling
1679 // process. We will compute this in the loop below. It's
1680 // cheap to compute and useful as a sanity check.
1681 size_t lclTotalNumEntries = 0;
1682 k_ptrs = tmpk_ptrs;
1683 {
1684 typename row_entries_type::const_type numRowEnt_h =
1685 staticGraph_->k_numRowEntries_;
1686 // This function can handle the counts being a host View.
1687 lclTotalNumEntries =
1688 Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1689 }
1690
1691 // Allocate the "packed" values array.
1692 // It has exactly the right number of entries.
1693 if (verbose) {
1694 std::ostringstream os;
1695 os << *prefix << "Allocate packed values: "
1696 << lclTotalNumEntries << endl;
1697 std::cerr << os.str ();
1698 }
1699 k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1700
1701 // Pack values_wdv into k_vals. We will replace values_wdv below.
1702 pack_functor<
1703 typename values_type::non_const_type,
1704 typename values_type::const_type,
1705 typename row_map_type::non_const_type,
1706 typename row_map_type::const_type> valsPacker
1707 (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1708 tmpk_ptrs, k_rowPtrs);
1709
1710 using exec_space = typename decltype (k_vals)::execution_space;
1711 using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1712 Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1713 range_type (0, lclNumRows), valsPacker);
1714 valuesPacked_wdv = values_wdv_type(k_vals);
1715 }
1716 else { // We don't have to pack, so just set the pointer.
1717 valuesPacked_wdv = valuesUnpacked_wdv;
1718 if (verbose) {
1719 std::ostringstream os;
1720 os << *prefix << "Storage already packed: "
1721 << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1722 std::cerr << os.str();
1723 }
1724 }
1725
1726 // May we ditch the old allocations for the packed one?
1727 if (requestOptimizedStorage) {
1728 // The user requested optimized storage, so we can dump the
1729 // unpacked 1-D storage, and keep the packed storage.
1730 valuesUnpacked_wdv = valuesPacked_wdv;
1731// k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1732 this->storageStatus_ = Details::STORAGE_1D_PACKED;
1733 }
1734 }
1735
1736 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1737 void
1739 insertIndicesAndValues (crs_graph_type& graph,
1740 RowInfo& rowInfo,
1741 const typename crs_graph_type::SLocalGlobalViews& newInds,
1742 const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1743 const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1744 const ELocalGlobal lg,
1745 const ELocalGlobal I)
1746 {
1747 const size_t oldNumEnt = rowInfo.numEntries;
1748 const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1749
1750 // Use of memcpy here works around an issue with GCC >= 4.9.0,
1751 // that probably relates to scalar_type vs. impl_scalar_type
1752 // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1753 // details; look for GCC_WORKAROUND macro definition.
1754 if (numInserted > 0) {
1755 const size_t startOffset = oldNumEnt;
1756 memcpy (&oldRowVals[startOffset], &newRowVals[0],
1757 numInserted * sizeof (impl_scalar_type));
1758 }
1759 }
1760
1761 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1762 void
1764 insertLocalValues (const LocalOrdinal lclRow,
1765 const Teuchos::ArrayView<const LocalOrdinal>& indices,
1766 const Teuchos::ArrayView<const Scalar>& values,
1767 const CombineMode CM)
1768 {
1769 using std::endl;
1770 const char tfecfFuncName[] = "insertLocalValues: ";
1771
1772 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1773 (! this->isFillActive (), std::runtime_error,
1774 "Fill is not active. After calling fillComplete, you must call "
1775 "resumeFill before you may insert entries into the matrix again.");
1776 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1777 (this->isStaticGraph (), std::runtime_error,
1778 "Cannot insert indices with static graph; use replaceLocalValues() "
1779 "instead.");
1780 // At this point, we know that myGraph_ is nonnull.
1781 crs_graph_type& graph = * (this->myGraph_);
1782 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1783 (graph.colMap_.is_null (), std::runtime_error,
1784 "Cannot insert local indices without a column map.");
1785 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1786 (graph.isGloballyIndexed (),
1787 std::runtime_error, "Graph indices are global; use "
1788 "insertGlobalValues().");
1789 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1790 (values.size () != indices.size (), std::runtime_error,
1791 "values.size() = " << values.size ()
1792 << " != indices.size() = " << indices.size () << ".");
1793 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1794 ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1795 "Local row index " << lclRow << " does not belong to this process.");
1796
1797 if (! graph.indicesAreAllocated ()) {
1798 // We only allocate values at most once per process, so it's OK
1799 // to check TPETRA_VERBOSE here.
1800 const bool verbose = Details::Behavior::verbose("CrsMatrix");
1801 this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1802 }
1803
1804#ifdef HAVE_TPETRA_DEBUG
1805 const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1806 // In a debug build, test whether any of the given column indices
1807 // are not in the column Map. Keep track of the invalid column
1808 // indices so we can tell the user about them.
1809 {
1810 using Teuchos::toString;
1811
1812 const map_type& colMap = * (graph.colMap_);
1813 Teuchos::Array<LocalOrdinal> badColInds;
1814 bool allInColMap = true;
1815 for (size_t k = 0; k < numEntriesToAdd; ++k) {
1816 if (! colMap.isNodeLocalElement (indices[k])) {
1817 allInColMap = false;
1818 badColInds.push_back (indices[k]);
1819 }
1820 }
1821 if (! allInColMap) {
1822 std::ostringstream os;
1823 os << "You attempted to insert entries in owned row " << lclRow
1824 << ", at the following column indices: " << toString (indices)
1825 << "." << endl;
1826 os << "Of those, the following indices are not in the column Map on "
1827 "this process: " << toString (badColInds) << "." << endl << "Since "
1828 "the matrix has a column Map already, it is invalid to insert "
1829 "entries at those locations.";
1830 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1831 (true, std::invalid_argument, os.str ());
1832 }
1833 }
1834#endif // HAVE_TPETRA_DEBUG
1835
1836 RowInfo rowInfo = graph.getRowInfo (lclRow);
1837
1838 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1839 if (CM == ADD) {
1840 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1841 valsView[offset] += values[k]; };
1842 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1843 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1844 } else if (CM == INSERT) {
1845 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1846 valsView[offset] = values[k]; };
1847 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1848 graph.insertLocalIndicesImpl(lclRow, indices, cb);
1849 } else {
1850 std::ostringstream os;
1851 os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1852 << "but this has not been implemented." << endl;
1853 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1854 (true, std::invalid_argument, os.str ());
1855 }
1856 }
1857
1858 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1859 void
1861 insertLocalValues (const LocalOrdinal localRow,
1862 const LocalOrdinal numEnt,
1863 const Scalar vals[],
1864 const LocalOrdinal cols[],
1865 const CombineMode CM)
1866 {
1867 Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1868 Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1869 this->insertLocalValues (localRow, colsT, valsT, CM);
1870 }
1871
1872 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1873 void
1876 RowInfo& rowInfo,
1877 const GlobalOrdinal gblColInds[],
1878 const impl_scalar_type vals[],
1879 const size_t numInputEnt)
1880 {
1881#ifdef HAVE_TPETRA_DEBUG
1882 const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1883 const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1884 const size_t curNumEnt = rowInfo.numEntries;
1885#endif // HAVE_TPETRA_DEBUG
1886
1887 if (! graph.indicesAreAllocated ()) {
1888 // We only allocate values at most once per process, so it's OK
1889 // to check TPETRA_VERBOSE here.
1890 using ::Tpetra::Details::Behavior;
1891 const bool verbose = Behavior::verbose("CrsMatrix");
1892 this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1893 // mfh 23 Jul 2017: allocateValues invalidates existing
1894 // getRowInfo results. Once we get rid of lazy graph
1895 // allocation, we'll be able to move the getRowInfo call outside
1896 // of this method.
1897 rowInfo = graph.getRowInfo (rowInfo.localRow);
1898 }
1899
1900 auto valsView = this->getValuesViewHostNonConst(rowInfo);
1901 auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1902 valsView[offset] += vals[k];
1903 };
1904 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1905#ifdef HAVE_TPETRA_DEBUG
1906 //numInserted is only used inside the debug code below.
1907 auto numInserted =
1908#endif
1909 graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1910
1911#ifdef HAVE_TPETRA_DEBUG
1912 size_t newNumEnt = curNumEnt + numInserted;
1913 const size_t chkNewNumEnt =
1914 graph.getNumEntriesInLocalRow (rowInfo.localRow);
1915 if (chkNewNumEnt != newNumEnt) {
1916 std::ostringstream os;
1917 os << std::endl << "newNumEnt = " << newNumEnt
1918 << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1919 << ") = " << chkNewNumEnt << "." << std::endl
1920 << "\torigNumEnt: " << origNumEnt << std::endl
1921 << "\tnumInputEnt: " << numInputEnt << std::endl
1922 << "\tgblColInds: [";
1923 for (size_t k = 0; k < numInputEnt; ++k) {
1924 os << gblColInds[k];
1925 if (k + size_t (1) < numInputEnt) {
1926 os << ",";
1927 }
1928 }
1929 os << "]" << std::endl
1930 << "\tvals: [";
1931 for (size_t k = 0; k < numInputEnt; ++k) {
1932 os << vals[k];
1933 if (k + size_t (1) < numInputEnt) {
1934 os << ",";
1935 }
1936 }
1937 os << "]" << std::endl;
1938
1939 if (this->supportsRowViews ()) {
1940 values_host_view_type vals2;
1941 if (this->isGloballyIndexed ()) {
1942 global_inds_host_view_type gblColInds2;
1943 const GlobalOrdinal gblRow =
1944 graph.rowMap_->getGlobalElement (rowInfo.localRow);
1945 if (gblRow ==
1946 Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1947 os << "Local row index " << rowInfo.localRow << " is invalid!"
1948 << std::endl;
1949 }
1950 else {
1951 bool getViewThrew = false;
1952 try {
1953 this->getGlobalRowView (gblRow, gblColInds2, vals2);
1954 }
1955 catch (std::exception& e) {
1956 getViewThrew = true;
1957 os << "getGlobalRowView threw exception:" << std::endl
1958 << e.what () << std::endl;
1959 }
1960 if (! getViewThrew) {
1961 os << "\tNew global column indices: ";
1962 for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1963 os << gblColInds2[jjj] << " ";
1964 os << std::endl;
1965 os << "\tNew values: ";
1966 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1967 os << vals2[jjj] << " ";
1968 os << std::endl;
1969 }
1970 }
1971 }
1972 else if (this->isLocallyIndexed ()) {
1973 local_inds_host_view_type lclColInds2;
1974 this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1975 os << "\tNew local column indices: ";
1976 for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1977 os << lclColInds2[jjj] << " ";
1978 os << std::endl;
1979 os << "\tNew values: ";
1980 for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1981 os << vals2[jjj] << " ";
1982 os << std::endl;
1983 }
1984 }
1985
1986 os << "Please report this bug to the Tpetra developers.";
1987 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1988 (true, std::logic_error, os.str ());
1989 }
1990#endif // HAVE_TPETRA_DEBUG
1991 }
1992
1993 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1994 void
1996 insertGlobalValues (const GlobalOrdinal gblRow,
1997 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1998 const Teuchos::ArrayView<const Scalar>& values)
1999 {
2000 using Teuchos::toString;
2001 using std::endl;
2002 typedef impl_scalar_type IST;
2003 typedef LocalOrdinal LO;
2004 typedef GlobalOrdinal GO;
2005 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2006 typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2007 const char tfecfFuncName[] = "insertGlobalValues: ";
2008
2009#ifdef HAVE_TPETRA_DEBUG
2010 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2011 (values.size () != indices.size (), std::runtime_error,
2012 "values.size() = " << values.size () << " != indices.size() = "
2013 << indices.size () << ".");
2014#endif // HAVE_TPETRA_DEBUG
2015
2016 // getRowMap() is not thread safe, because it increments RCP's
2017 // reference count. getCrsGraphRef() is thread safe.
2018 const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2019 const LO lclRow = rowMap.getLocalElement (gblRow);
2020
2021 if (lclRow == OTLO::invalid ()) {
2022 // Input row is _not_ owned by the calling process.
2023 //
2024 // See a note (now deleted) from mfh 14 Dec 2012: If input row
2025 // is not in the row Map, it doesn't matter whether or not the
2026 // graph is static; the data just get stashed for later use by
2027 // globalAssemble().
2028 this->insertNonownedGlobalValues (gblRow, indices, values);
2029 }
2030 else { // Input row _is_ owned by the calling process
2031 if (this->isStaticGraph ()) {
2032 // Uh oh! Not allowed to insert into owned rows in that case.
2033 const int myRank = rowMap.getComm ()->getRank ();
2034 const int numProcs = rowMap.getComm ()->getSize ();
2035 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2036 (true, std::runtime_error,
2037 "The matrix was constructed with a constant (\"static\") graph, "
2038 "yet the given global row index " << gblRow << " is in the row "
2039 "Map on the calling process (with rank " << myRank << ", of " <<
2040 numProcs << " process(es)). In this case, you may not insert "
2041 "new entries into rows owned by the calling process.");
2042 }
2043
2044 crs_graph_type& graph = * (this->myGraph_);
2045 const IST* const inputVals =
2046 reinterpret_cast<const IST*> (values.getRawPtr ());
2047 const GO* const inputGblColInds = indices.getRawPtr ();
2048 const size_t numInputEnt = indices.size ();
2049 RowInfo rowInfo = graph.getRowInfo (lclRow);
2050
2051 // If the matrix has a column Map, check at this point whether
2052 // the column indices belong to the column Map.
2053 //
2054 // FIXME (mfh 16 May 2013) We may want to consider deferring the
2055 // test to the CrsGraph method, since it may have to do this
2056 // anyway.
2057 if (! graph.colMap_.is_null ()) {
2058 const map_type& colMap = * (graph.colMap_);
2059 // In a debug build, keep track of the nonowned ("bad") column
2060 // indices, so that we can display them in the exception
2061 // message. In a release build, just ditch the loop early if
2062 // we encounter a nonowned column index.
2063#ifdef HAVE_TPETRA_DEBUG
2064 Teuchos::Array<GO> badColInds;
2065#endif // HAVE_TPETRA_DEBUG
2066 const size_type numEntriesToInsert = indices.size ();
2067 bool allInColMap = true;
2068 for (size_type k = 0; k < numEntriesToInsert; ++k) {
2069 if (! colMap.isNodeGlobalElement (indices[k])) {
2070 allInColMap = false;
2071#ifdef HAVE_TPETRA_DEBUG
2072 badColInds.push_back (indices[k]);
2073#else
2074 break;
2075#endif // HAVE_TPETRA_DEBUG
2076 }
2077 }
2078 if (! allInColMap) {
2079 std::ostringstream os;
2080 os << "You attempted to insert entries in owned row " << gblRow
2081 << ", at the following column indices: " << toString (indices)
2082 << "." << endl;
2083#ifdef HAVE_TPETRA_DEBUG
2084 os << "Of those, the following indices are not in the column Map "
2085 "on this process: " << toString (badColInds) << "." << endl
2086 << "Since the matrix has a column Map already, it is invalid "
2087 "to insert entries at those locations.";
2088#else
2089 os << "At least one of those indices is not in the column Map "
2090 "on this process." << endl << "It is invalid to insert into "
2091 "columns not in the column Map on the process that owns the "
2092 "row.";
2093#endif // HAVE_TPETRA_DEBUG
2094 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2095 (true, std::invalid_argument, os.str ());
2096 }
2097 }
2098
2099 this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2100 inputVals, numInputEnt);
2101 }
2102 }
2103
2104
2105 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2106 void
2108 insertGlobalValues (const GlobalOrdinal globalRow,
2109 const LocalOrdinal numEnt,
2110 const Scalar vals[],
2111 const GlobalOrdinal inds[])
2112 {
2113 Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2114 Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2115 this->insertGlobalValues (globalRow, indsT, valsT);
2116 }
2117
2118
2119 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2120 void
2123 const GlobalOrdinal gblRow,
2124 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2125 const Teuchos::ArrayView<const Scalar>& values,
2126 const bool debug)
2127 {
2128 typedef impl_scalar_type IST;
2129 typedef LocalOrdinal LO;
2130 typedef GlobalOrdinal GO;
2131 typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2132 const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2133
2134 if (debug) {
2135 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2136 (values.size () != indices.size (), std::runtime_error,
2137 "values.size() = " << values.size () << " != indices.size() = "
2138 << indices.size () << ".");
2139 }
2140
2141 // getRowMap() is not thread safe, because it increments RCP's
2142 // reference count. getCrsGraphRef() is thread safe.
2143 const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2144 const LO lclRow = rowMap.getLocalElement (gblRow);
2145 if (lclRow == OTLO::invalid ()) {
2146 // Input row is _not_ owned by the calling process.
2147 //
2148 // See a note (now deleted) from mfh 14 Dec 2012: If input row
2149 // is not in the row Map, it doesn't matter whether or not the
2150 // graph is static; the data just get stashed for later use by
2151 // globalAssemble().
2152 this->insertNonownedGlobalValues (gblRow, indices, values);
2153 }
2154 else { // Input row _is_ owned by the calling process
2155 if (this->isStaticGraph ()) {
2156 // Uh oh! Not allowed to insert into owned rows in that case.
2157 const int myRank = rowMap.getComm ()->getRank ();
2158 const int numProcs = rowMap.getComm ()->getSize ();
2159 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2160 (true, std::runtime_error,
2161 "The matrix was constructed with a constant (\"static\") graph, "
2162 "yet the given global row index " << gblRow << " is in the row "
2163 "Map on the calling process (with rank " << myRank << ", of " <<
2164 numProcs << " process(es)). In this case, you may not insert "
2165 "new entries into rows owned by the calling process.");
2166 }
2167
2168 crs_graph_type& graph = * (this->myGraph_);
2169 const IST* const inputVals =
2170 reinterpret_cast<const IST*> (values.getRawPtr ());
2171 const GO* const inputGblColInds = indices.getRawPtr ();
2172 const size_t numInputEnt = indices.size ();
2173 RowInfo rowInfo = graph.getRowInfo (lclRow);
2174
2175 if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2176 // This branch is similar in function to the following branch, but for
2177 // the special case that the target graph is locally indexed.
2178 // In this case, we cannot simply filter
2179 // out global indices that don't exist on the receiving process and
2180 // insert the remaining (global) indices, but we must convert them (the
2181 // remaining global indices) to local and call `insertLocalValues`.
2182 const map_type& colMap = * (graph.colMap_);
2183 size_t curOffset = 0;
2184 while (curOffset < numInputEnt) {
2185 // Find a sequence of input indices that are in the column Map on the
2186 // calling process. Doing a sequence at a time, instead of one at a
2187 // time, amortizes some overhead.
2188 Teuchos::Array<LO> lclIndices;
2189 size_t endOffset = curOffset;
2190 for ( ; endOffset < numInputEnt; ++endOffset) {
2191 auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2192 if (lclIndex != OTLO::invalid())
2193 lclIndices.push_back(lclIndex);
2194 else
2195 break;
2196 }
2197 // curOffset, endOffset: half-exclusive range of indices in the column
2198 // Map on the calling process. If endOffset == curOffset, the range is
2199 // empty.
2200 const LO numIndInSeq = (endOffset - curOffset);
2201 if (numIndInSeq != 0) {
2202 this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2203 }
2204 // Invariant before the increment line: Either endOffset ==
2205 // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2206 // on the calling process.
2207 if (debug) {
2208 const bool invariant = endOffset == numInputEnt ||
2209 colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2210 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2211 (! invariant, std::logic_error, std::endl << "Invariant failed!");
2212 }
2213 curOffset = endOffset + 1;
2214 }
2215 }
2216 else if (! graph.colMap_.is_null ()) { // We have a column Map.
2217 const map_type& colMap = * (graph.colMap_);
2218 size_t curOffset = 0;
2219 while (curOffset < numInputEnt) {
2220 // Find a sequence of input indices that are in the column
2221 // Map on the calling process. Doing a sequence at a time,
2222 // instead of one at a time, amortizes some overhead.
2223 size_t endOffset = curOffset;
2224 for ( ; endOffset < numInputEnt &&
2225 colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2226 ++endOffset)
2227 {}
2228 // curOffset, endOffset: half-exclusive range of indices in
2229 // the column Map on the calling process. If endOffset ==
2230 // curOffset, the range is empty.
2231 const LO numIndInSeq = (endOffset - curOffset);
2232 if (numIndInSeq != 0) {
2233 rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2234 this->insertGlobalValuesImpl (graph, rowInfo,
2235 inputGblColInds + curOffset,
2236 inputVals + curOffset,
2237 numIndInSeq);
2238 }
2239 // Invariant before the increment line: Either endOffset ==
2240 // numInputEnt, or inputGblColInds[endOffset] is not in the
2241 // column Map on the calling process.
2242 if (debug) {
2243 const bool invariant = endOffset == numInputEnt ||
2244 colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2245 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2246 (! invariant, std::logic_error, std::endl << "Invariant failed!");
2247 }
2248 curOffset = endOffset + 1;
2250 }
2251 else { // we don't have a column Map.
2252 this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2253 inputVals, numInputEnt);
2254 }
2255 }
2256 }
2257
2258 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2259 void
2260 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2261 insertGlobalValuesFilteredChecked(
2262 const GlobalOrdinal gblRow,
2263 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2264 const Teuchos::ArrayView<const Scalar>& values,
2265 const char* const prefix,
2266 const bool debug,
2267 const bool verbose)
2268 {
2269 using Details::verbosePrintArray;
2270 using std::endl;
2271
2272 try {
2273 insertGlobalValuesFiltered(gblRow, indices, values, debug);
2274 }
2275 catch(std::exception& e) {
2276 std::ostringstream os;
2277 if (verbose) {
2278 const size_t maxNumToPrint =
2279 Details::Behavior::verbosePrintCountThreshold();
2280 os << *prefix << ": insertGlobalValuesFiltered threw an "
2281 "exception: " << e.what() << endl
2282 << "Global row index: " << gblRow << endl;
2283 verbosePrintArray(os, indices, "Global column indices",
2284 maxNumToPrint);
2285 os << endl;
2286 verbosePrintArray(os, values, "Values", maxNumToPrint);
2287 os << endl;
2288 }
2289 else {
2290 os << ": insertGlobalValuesFiltered threw an exception: "
2291 << e.what();
2292 }
2293 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2294 }
2295 }
2296
2297 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2298 LocalOrdinal
2301 const crs_graph_type& graph,
2302 const RowInfo& rowInfo,
2303 const LocalOrdinal inds[],
2304 const impl_scalar_type newVals[],
2305 const LocalOrdinal numElts)
2306 {
2307 typedef LocalOrdinal LO;
2308 typedef GlobalOrdinal GO;
2309 const bool sorted = graph.isSorted ();
2310
2311 size_t hint = 0; // Guess for the current index k into rowVals
2312 LO numValid = 0; // number of valid local column indices
2313
2314 if (graph.isLocallyIndexed ()) {
2315 // Get a view of the column indices in the row. This amortizes
2316 // the cost of getting the view over all the entries of inds.
2317 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2318
2319 for (LO j = 0; j < numElts; ++j) {
2320 const LO lclColInd = inds[j];
2321 const size_t offset =
2322 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2323 lclColInd, hint, sorted);
2324 if (offset != rowInfo.numEntries) {
2325 rowVals[offset] = newVals[j];
2326 hint = offset + 1;
2327 ++numValid;
2328 }
2329 }
2330 }
2331 else if (graph.isGloballyIndexed ()) {
2332 if (graph.colMap_.is_null ()) {
2333 return Teuchos::OrdinalTraits<LO>::invalid ();
2334 }
2335 const map_type colMap = * (graph.colMap_);
2336
2337 // Get a view of the column indices in the row. This amortizes
2338 // the cost of getting the view over all the entries of inds.
2339 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2340
2341 for (LO j = 0; j < numElts; ++j) {
2342 const GO gblColInd = colMap.getGlobalElement (inds[j]);
2343 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2344 const size_t offset =
2345 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2346 gblColInd, hint, sorted);
2347 if (offset != rowInfo.numEntries) {
2348 rowVals[offset] = newVals[j];
2349 hint = offset + 1;
2350 ++numValid;
2351 }
2352 }
2353 }
2354 }
2355 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2356 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2357 // to be neither locally nor globally indexed on a process.
2358 // This means that the graph or matrix has no entries on that
2359 // process. Epetra also works like this. It's related to lazy
2360 // allocation (on first insertion, not at graph / matrix
2361 // construction). Lazy allocation will go away because it is
2362 // not thread scalable.
2363
2364 return numValid;
2365 }
2366
2367 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2368 LocalOrdinal
2370 replaceLocalValues (const LocalOrdinal localRow,
2371 const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2372 const Teuchos::ArrayView<const Scalar>& vals)
2373 {
2374 typedef LocalOrdinal LO;
2375
2376 const LO numInputEnt = static_cast<LO> (lclCols.size ());
2377 if (static_cast<LO> (vals.size ()) != numInputEnt) {
2378 return Teuchos::OrdinalTraits<LO>::invalid ();
2379 }
2380 const LO* const inputInds = lclCols.getRawPtr ();
2381 const Scalar* const inputVals = vals.getRawPtr ();
2382 return this->replaceLocalValues (localRow, numInputEnt,
2383 inputVals, inputInds);
2384 }
2385
2386 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2388 local_ordinal_type
2391 const local_ordinal_type localRow,
2392 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2393 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2394 {
2396 const LO numInputEnt = inputInds.extent(0);
2397 if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2398 return Teuchos::OrdinalTraits<LO>::invalid();
2399 }
2400 const Scalar* const inVals =
2401 reinterpret_cast<const Scalar*>(inputVals.data());
2402 return this->replaceLocalValues(localRow, numInputEnt,
2403 inVals, inputInds.data());
2405
2406 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2407 LocalOrdinal
2409 replaceLocalValues (const LocalOrdinal localRow,
2410 const LocalOrdinal numEnt,
2411 const Scalar inputVals[],
2412 const LocalOrdinal inputCols[])
2413 {
2414 typedef impl_scalar_type IST;
2415 typedef LocalOrdinal LO;
2416
2417 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2418 // Fill must be active and the "nonconst" graph must exist.
2419 return Teuchos::OrdinalTraits<LO>::invalid ();
2420 }
2421 const crs_graph_type& graph = * (this->staticGraph_);
2422 const RowInfo rowInfo = graph.getRowInfo (localRow);
2423
2424 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2425 // The calling process does not own this row, so it is not
2426 // allowed to modify its values.
2427 return static_cast<LO> (0);
2428 }
2429 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2430 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2431 return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2432 inputCols, inVals, numEnt);
2433 }
2434
2435 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2436 LocalOrdinal
2439 const crs_graph_type& graph,
2440 const RowInfo& rowInfo,
2441 const GlobalOrdinal inds[],
2442 const impl_scalar_type newVals[],
2443 const LocalOrdinal numElts)
2444 {
2445 Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2446 auto fun =
2447 [&](size_t const k, size_t const /*start*/, size_t const offset) {
2448 rowVals[offset] = newVals[k];
2449 };
2450 std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2451 return graph.findGlobalIndices(rowInfo, indsT, cb);
2453
2454 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2455 LocalOrdinal
2457 replaceGlobalValues (const GlobalOrdinal globalRow,
2458 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2459 const Teuchos::ArrayView<const Scalar>& inputVals)
2460 {
2461 typedef LocalOrdinal LO;
2462
2463 const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2464 if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2465 return Teuchos::OrdinalTraits<LO>::invalid ();
2466 }
2467 return this->replaceGlobalValues (globalRow, numInputEnt,
2468 inputVals.getRawPtr (),
2469 inputGblColInds.getRawPtr ());
2470 }
2471
2472 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2473 LocalOrdinal
2475 replaceGlobalValues (const GlobalOrdinal globalRow,
2476 const LocalOrdinal numEnt,
2477 const Scalar inputVals[],
2478 const GlobalOrdinal inputGblColInds[])
2479 {
2480 typedef impl_scalar_type IST;
2481 typedef LocalOrdinal LO;
2482
2483 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2484 // Fill must be active and the "nonconst" graph must exist.
2485 return Teuchos::OrdinalTraits<LO>::invalid ();
2486 }
2487 const crs_graph_type& graph = * (this->staticGraph_);
2488
2489 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2490 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2491 // The input local row is invalid on the calling process,
2492 // which means that the calling process summed 0 entries.
2493 return static_cast<LO> (0);
2494 }
2495
2496 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2497 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2498 return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2499 inputGblColInds, inVals, numEnt);
2501
2502 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2504 local_ordinal_type
2507 const global_ordinal_type globalRow,
2508 const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2509 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2510 {
2511 // We use static_assert here to check the template parameters,
2512 // rather than std::enable_if (e.g., on the return value, to
2513 // enable compilation only if the template parameters match the
2514 // desired attributes). This turns obscure link errors into
2515 // clear compilation errors. It also makes the return value a
2516 // lot easier to see.
2517 using LO = local_ordinal_type;
2518 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2519 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2520 return Teuchos::OrdinalTraits<LO>::invalid();
2522 const Scalar* const inVals =
2523 reinterpret_cast<const Scalar*>(inputVals.data());
2524 return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2525 inputInds.data());
2526 }
2527
2528 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2529 LocalOrdinal
2532 const crs_graph_type& graph,
2533 const RowInfo& rowInfo,
2534 const GlobalOrdinal inds[],
2535 const impl_scalar_type newVals[],
2536 const LocalOrdinal numElts,
2537 const bool atomic)
2538 {
2539 typedef LocalOrdinal LO;
2540 typedef GlobalOrdinal GO;
2541
2542 const bool sorted = graph.isSorted ();
2543
2544 size_t hint = 0; // guess at the index's relative offset in the row
2545 LO numValid = 0; // number of valid input column indices
2546
2547 if (graph.isLocallyIndexed ()) {
2548 // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2549 // pointer does NOT change its reference count. Thus, this
2550 // code is still thread safe.
2551 if (graph.colMap_.is_null ()) {
2552 // NO input column indices are valid in this case, since if
2553 // the column Map is null on the calling process, then the
2554 // calling process owns no graph entries.
2555 return numValid;
2556 }
2557 const map_type& colMap = * (graph.colMap_);
2558
2559 // Get a view of the column indices in the row. This amortizes
2560 // the cost of getting the view over all the entries of inds.
2561 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2562 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2563
2564 for (LO j = 0; j < numElts; ++j) {
2565 const LO lclColInd = colMap.getLocalElement (inds[j]);
2566 if (lclColInd != LINV) {
2567 const size_t offset =
2568 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2569 lclColInd, hint, sorted);
2570 if (offset != rowInfo.numEntries) {
2571 if (atomic) {
2572 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2573 }
2574 else {
2575 rowVals[offset] += newVals[j];
2576 }
2577 hint = offset + 1;
2578 numValid++;
2579 }
2580 }
2581 }
2582 }
2583 else if (graph.isGloballyIndexed ()) {
2584 // Get a view of the column indices in the row. This amortizes
2585 // the cost of getting the view over all the entries of inds.
2586 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2587
2588 for (LO j = 0; j < numElts; ++j) {
2589 const GO gblColInd = inds[j];
2590 const size_t offset =
2591 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2592 gblColInd, hint, sorted);
2593 if (offset != rowInfo.numEntries) {
2594 if (atomic) {
2595 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2597 else {
2598 rowVals[offset] += newVals[j];
2599 }
2600 hint = offset + 1;
2601 numValid++;
2602 }
2603 }
2604 }
2605 // If the graph is neither locally nor globally indexed on the
2606 // calling process, that means the calling process has no graph
2607 // entries. Thus, none of the input column indices are valid.
2608
2609 return numValid;
2610 }
2611
2612 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2613 LocalOrdinal
2615 sumIntoGlobalValues (const GlobalOrdinal gblRow,
2616 const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2617 const Teuchos::ArrayView<const Scalar>& inputVals,
2618 const bool atomic)
2619 {
2620 typedef LocalOrdinal LO;
2621
2622 const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2623 if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2624 return Teuchos::OrdinalTraits<LO>::invalid ();
2625 }
2626 return this->sumIntoGlobalValues (gblRow, numInputEnt,
2627 inputVals.getRawPtr (),
2628 inputGblColInds.getRawPtr (),
2629 atomic);
2630 }
2631
2632 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2633 LocalOrdinal
2635 sumIntoGlobalValues (const GlobalOrdinal gblRow,
2636 const LocalOrdinal numInputEnt,
2637 const Scalar inputVals[],
2638 const GlobalOrdinal inputGblColInds[],
2639 const bool atomic)
2640 {
2641 typedef impl_scalar_type IST;
2642 typedef LocalOrdinal LO;
2643 typedef GlobalOrdinal GO;
2644
2645 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2646 // Fill must be active and the "nonconst" graph must exist.
2647 return Teuchos::OrdinalTraits<LO>::invalid ();
2648 }
2649 const crs_graph_type& graph = * (this->staticGraph_);
2650
2651 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2652 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2653 // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2654 // thread safe in a debug build, in part because it uses
2655 // Teuchos::ArrayView, and in part because of the data structure
2656 // used to stash outgoing entries.
2657 using Teuchos::ArrayView;
2658 ArrayView<const GO> inputGblColInds_av(
2659 numInputEnt == 0 ? nullptr : inputGblColInds,
2660 numInputEnt);
2661 ArrayView<const Scalar> inputVals_av(
2662 numInputEnt == 0 ? nullptr :
2663 inputVals, numInputEnt);
2664 // gblRow is not in the row Map on the calling process, so stash
2665 // the given entries away in a separate data structure.
2666 // globalAssemble() (called during fillComplete()) will exchange
2667 // that data and sum it in using sumIntoGlobalValues().
2668 this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2669 inputVals_av);
2670 // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2671 // since we won't know whether the given indices were valid
2672 // until globalAssemble (called in fillComplete) is called.
2673 // That's why insertNonownedGlobalValues doesn't return
2674 // anything. Just for consistency, I'll return the number of
2675 // entries that the user gave us.
2676 return numInputEnt;
2677 }
2678 else { // input row is in the row Map on the calling process
2679 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2680 const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2681 return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2682 inputGblColInds, inVals,
2683 numInputEnt, atomic);
2684 }
2685 }
2686
2687 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2688 LocalOrdinal
2689 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2690 transformLocalValues (const LocalOrdinal lclRow,
2691 const LocalOrdinal numInputEnt,
2692 const impl_scalar_type inputVals[],
2693 const LocalOrdinal inputCols[],
2694 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2695 const bool atomic)
2696 {
2697 using Tpetra::Details::OrdinalTraits;
2698 typedef LocalOrdinal LO;
2699
2700 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2701 // Fill must be active and the "nonconst" graph must exist.
2702 return Teuchos::OrdinalTraits<LO>::invalid ();
2703 }
2704 const crs_graph_type& graph = * (this->staticGraph_);
2705 const RowInfo rowInfo = graph.getRowInfo (lclRow);
2706
2707 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2708 // The calling process does not own this row, so it is not
2709 // allowed to modify its values.
2710 return static_cast<LO> (0);
2711 }
2712 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2713 return this->transformLocalValues (curRowVals.data (), graph,
2714 rowInfo, inputCols, inputVals,
2715 numInputEnt, f, atomic);
2717
2718 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2719 LocalOrdinal
2721 transformGlobalValues (const GlobalOrdinal gblRow,
2722 const LocalOrdinal numInputEnt,
2723 const impl_scalar_type inputVals[],
2724 const GlobalOrdinal inputCols[],
2725 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2726 const bool atomic)
2727 {
2728 using Tpetra::Details::OrdinalTraits;
2729 typedef LocalOrdinal LO;
2730
2731 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2732 // Fill must be active and the "nonconst" graph must exist.
2733 return OrdinalTraits<LO>::invalid ();
2734 }
2735 const crs_graph_type& graph = * (this->staticGraph_);
2736 const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2737
2738 if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2739 // The calling process does not own this row, so it is not
2740 // allowed to modify its values.
2741 return static_cast<LO> (0);
2742 }
2743 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2744 return this->transformGlobalValues (curRowVals.data (), graph,
2745 rowInfo, inputCols, inputVals,
2746 numInputEnt, f, atomic);
2747 }
2748
2749 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2750 LocalOrdinal
2751 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2752 transformLocalValues (impl_scalar_type rowVals[],
2753 const crs_graph_type& graph,
2754 const RowInfo& rowInfo,
2755 const LocalOrdinal inds[],
2756 const impl_scalar_type newVals[],
2757 const LocalOrdinal numElts,
2758 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2759 const bool atomic)
2760 {
2761 typedef impl_scalar_type ST;
2762 typedef LocalOrdinal LO;
2763 typedef GlobalOrdinal GO;
2764
2765 //if (newVals.extent (0) != inds.extent (0)) {
2766 // The sizes of the input arrays must match.
2767 //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2768 //}
2769 //const LO numElts = static_cast<LO> (inds.extent (0));
2770 const bool sorted = graph.isSorted ();
2771
2772 LO numValid = 0; // number of valid input column indices
2773 size_t hint = 0; // Guess for the current index k into rowVals
2774
2775 if (graph.isLocallyIndexed ()) {
2776 // Get a view of the column indices in the row. This amortizes
2777 // the cost of getting the view over all the entries of inds.
2778 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2779
2780 for (LO j = 0; j < numElts; ++j) {
2781 const LO lclColInd = inds[j];
2782 const size_t offset =
2783 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2784 lclColInd, hint, sorted);
2785 if (offset != rowInfo.numEntries) {
2786 if (atomic) {
2787 // NOTE (mfh 30 Nov 2015) The commented-out code is
2788 // wrong because another thread may have changed
2789 // rowVals[offset] between those two lines of code.
2790 //
2791 //const ST newVal = f (rowVals[offset], newVals[j]);
2792 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2793
2794 volatile ST* const dest = &rowVals[offset];
2795 (void) atomic_binary_function_update (dest, newVals[j], f);
2796 }
2797 else {
2798 // use binary function f
2799 rowVals[offset] = f (rowVals[offset], newVals[j]);
2800 }
2801 hint = offset + 1;
2802 ++numValid;
2803 }
2804 }
2805 }
2806 else if (graph.isGloballyIndexed ()) {
2807 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2808 // pointer does NOT change its reference count. Thus, this
2809 // code is still thread safe.
2810 if (graph.colMap_.is_null ()) {
2811 // NO input column indices are valid in this case. Either
2812 // the column Map hasn't been set yet (so local indices
2813 // don't exist yet), or the calling process owns no graph
2814 // entries.
2815 return numValid;
2816 }
2817 const map_type& colMap = * (graph.colMap_);
2818 // Get a view of the column indices in the row. This amortizes
2819 // the cost of getting the view over all the entries of inds.
2820 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2821
2822 const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2823 for (LO j = 0; j < numElts; ++j) {
2824 const GO gblColInd = colMap.getGlobalElement (inds[j]);
2825 if (gblColInd != GINV) {
2826 const size_t offset =
2827 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2828 gblColInd, hint, sorted);
2829 if (offset != rowInfo.numEntries) {
2830 if (atomic) {
2831 // NOTE (mfh 30 Nov 2015) The commented-out code is
2832 // wrong because another thread may have changed
2833 // rowVals[offset] between those two lines of code.
2834 //
2835 //const ST newVal = f (rowVals[offset], newVals[j]);
2836 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2837
2838 volatile ST* const dest = &rowVals[offset];
2839 (void) atomic_binary_function_update (dest, newVals[j], f);
2840 }
2841 else {
2842 // use binary function f
2843 rowVals[offset] = f (rowVals[offset], newVals[j]);
2844 }
2845 hint = offset + 1;
2846 numValid++;
2847 }
2848 }
2849 }
2851 // If the graph is neither locally nor globally indexed on the
2852 // calling process, that means the calling process has no graph
2853 // entries. Thus, none of the input column indices are valid.
2854
2855 return numValid;
2856 }
2857
2858 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2859 LocalOrdinal
2862 const crs_graph_type& graph,
2863 const RowInfo& rowInfo,
2864 const GlobalOrdinal inds[],
2865 const impl_scalar_type newVals[],
2866 const LocalOrdinal numElts,
2867 std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2868 const bool atomic)
2869 {
2870 typedef impl_scalar_type ST;
2871 typedef LocalOrdinal LO;
2872 typedef GlobalOrdinal GO;
2873
2874 //if (newVals.extent (0) != inds.extent (0)) {
2875 // The sizes of the input arrays must match.
2876 //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2877 //}
2878 //const LO numElts = static_cast<LO> (inds.extent (0));
2879 const bool sorted = graph.isSorted ();
2880
2881 LO numValid = 0; // number of valid input column indices
2882 size_t hint = 0; // Guess for the current index k into rowVals
2883
2884 if (graph.isGloballyIndexed ()) {
2885 // Get a view of the column indices in the row. This amortizes
2886 // the cost of getting the view over all the entries of inds.
2887 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2888
2889 for (LO j = 0; j < numElts; ++j) {
2890 const GO gblColInd = inds[j];
2891 const size_t offset =
2892 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2893 gblColInd, hint, sorted);
2894 if (offset != rowInfo.numEntries) {
2895 if (atomic) {
2896 // NOTE (mfh 30 Nov 2015) The commented-out code is
2897 // wrong because another thread may have changed
2898 // rowVals[offset] between those two lines of code.
2899 //
2900 //const ST newVal = f (rowVals[offset], newVals[j]);
2901 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2902
2903 volatile ST* const dest = &rowVals[offset];
2904 (void) atomic_binary_function_update (dest, newVals[j], f);
2905 }
2906 else {
2907 // use binary function f
2908 rowVals[offset] = f (rowVals[offset], newVals[j]);
2909 }
2910 hint = offset + 1;
2911 ++numValid;
2912 }
2913 }
2914 }
2915 else if (graph.isLocallyIndexed ()) {
2916 // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2917 // pointer does NOT change its reference count. Thus, this
2918 // code is still thread safe.
2919 if (graph.colMap_.is_null ()) {
2920 // NO input column indices are valid in this case. Either the
2921 // column Map hasn't been set yet (so local indices don't
2922 // exist yet), or the calling process owns no graph entries.
2923 return numValid;
2924 }
2925 const map_type& colMap = * (graph.colMap_);
2926 // Get a view of the column indices in the row. This amortizes
2927 // the cost of getting the view over all the entries of inds.
2928 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2929
2930 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2931 for (LO j = 0; j < numElts; ++j) {
2932 const LO lclColInd = colMap.getLocalElement (inds[j]);
2933 if (lclColInd != LINV) {
2934 const size_t offset =
2935 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2936 lclColInd, hint, sorted);
2937 if (offset != rowInfo.numEntries) {
2938 if (atomic) {
2939 // NOTE (mfh 30 Nov 2015) The commented-out code is
2940 // wrong because another thread may have changed
2941 // rowVals[offset] between those two lines of code.
2942 //
2943 //const ST newVal = f (rowVals[offset], newVals[j]);
2944 //Kokkos::atomic_assign (&rowVals[offset], newVal);
2945
2946 volatile ST* const dest = &rowVals[offset];
2947 (void) atomic_binary_function_update (dest, newVals[j], f);
2948 }
2949 else {
2950 // use binary function f
2951 rowVals[offset] = f (rowVals[offset], newVals[j]);
2952 }
2953 hint = offset + 1;
2954 numValid++;
2955 }
2956 }
2957 }
2958 }
2959 // If the graph is neither locally nor globally indexed on the
2960 // calling process, that means the calling process has no graph
2961 // entries. Thus, none of the input column indices are valid.
2962
2963 return numValid;
2964 }
2965
2966 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2967 LocalOrdinal
2970 const crs_graph_type& graph,
2971 const RowInfo& rowInfo,
2972 const LocalOrdinal inds[],
2973 const impl_scalar_type newVals[],
2974 const LocalOrdinal numElts,
2975 const bool atomic)
2976 {
2977 typedef LocalOrdinal LO;
2978 typedef GlobalOrdinal GO;
2979
2980 const bool sorted = graph.isSorted ();
2981
2982 size_t hint = 0; // Guess for the current index k into rowVals
2983 LO numValid = 0; // number of valid local column indices
2984
2985 if (graph.isLocallyIndexed ()) {
2986 // Get a view of the column indices in the row. This amortizes
2987 // the cost of getting the view over all the entries of inds.
2988 auto colInds = graph.getLocalIndsViewHost (rowInfo);
2989
2990 for (LO j = 0; j < numElts; ++j) {
2991 const LO lclColInd = inds[j];
2992 const size_t offset =
2993 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2994 lclColInd, hint, sorted);
2995 if (offset != rowInfo.numEntries) {
2996 if (atomic) {
2997 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2998 }
2999 else {
3000 rowVals[offset] += newVals[j];
3001 }
3002 hint = offset + 1;
3003 ++numValid;
3004 }
3005 }
3006 }
3007 else if (graph.isGloballyIndexed ()) {
3008 if (graph.colMap_.is_null ()) {
3009 return Teuchos::OrdinalTraits<LO>::invalid ();
3010 }
3011 const map_type colMap = * (graph.colMap_);
3012
3013 // Get a view of the column indices in the row. This amortizes
3014 // the cost of getting the view over all the entries of inds.
3015 auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3016
3017 for (LO j = 0; j < numElts; ++j) {
3018 const GO gblColInd = colMap.getGlobalElement (inds[j]);
3019 if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3020 const size_t offset =
3021 KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3022 gblColInd, hint, sorted);
3023 if (offset != rowInfo.numEntries) {
3024 if (atomic) {
3025 Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3026 }
3027 else {
3028 rowVals[offset] += newVals[j];
3029 }
3030 hint = offset + 1;
3031 ++numValid;
3032 }
3034 }
3035 }
3036 // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3037 // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3038 // to be neither locally nor globally indexed on a process.
3039 // This means that the graph or matrix has no entries on that
3040 // process. Epetra also works like this. It's related to lazy
3041 // allocation (on first insertion, not at graph / matrix
3042 // construction). Lazy allocation will go away because it is
3043 // not thread scalable.
3044
3045 return numValid;
3046 }
3047
3048 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3049 LocalOrdinal
3051 sumIntoLocalValues (const LocalOrdinal localRow,
3052 const Teuchos::ArrayView<const LocalOrdinal>& indices,
3053 const Teuchos::ArrayView<const Scalar>& values,
3054 const bool atomic)
3055 {
3056 using LO = local_ordinal_type;
3057 const LO numInputEnt = static_cast<LO>(indices.size());
3058 if (static_cast<LO>(values.size()) != numInputEnt) {
3059 return Teuchos::OrdinalTraits<LO>::invalid();
3060 }
3061 const LO* const inputInds = indices.getRawPtr();
3062 const scalar_type* const inputVals = values.getRawPtr();
3063 return this->sumIntoLocalValues(localRow, numInputEnt,
3064 inputVals, inputInds, atomic);
3065 }
3066
3067 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3069 local_ordinal_type
3072 const local_ordinal_type localRow,
3073 const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3074 const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3075 const bool atomic)
3076 {
3077 using LO = local_ordinal_type;
3078 const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3079 if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3080 return Teuchos::OrdinalTraits<LO>::invalid();
3081 }
3082 const scalar_type* inVals =
3083 reinterpret_cast<const scalar_type*>(inputVals.data());
3084 return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3085 inputInds.data(), atomic);
3086 }
3087
3088 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3089 LocalOrdinal
3091 sumIntoLocalValues (const LocalOrdinal localRow,
3092 const LocalOrdinal numEnt,
3093 const Scalar vals[],
3094 const LocalOrdinal cols[],
3095 const bool atomic)
3096 {
3097 typedef impl_scalar_type IST;
3098 typedef LocalOrdinal LO;
3099
3100 if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3101 // Fill must be active and the "nonconst" graph must exist.
3102 return Teuchos::OrdinalTraits<LO>::invalid ();
3103 }
3104 const crs_graph_type& graph = * (this->staticGraph_);
3105 const RowInfo rowInfo = graph.getRowInfo (localRow);
3106
3107 if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3108 // The calling process does not own this row, so it is not
3109 // allowed to modify its values.
3110 return static_cast<LO> (0);
3111 }
3112 auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3113 const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3114 return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3115 cols, inputVals, numEnt, atomic);
3116 }
3117
3118 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3120 values_dualv_type::t_host::const_type
3122 getValuesViewHost (const RowInfo& rowinfo) const
3123 {
3124 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3125 return typename values_dualv_type::t_host::const_type ();
3126 else
3127 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3128 rowinfo.allocSize,
3129 Access::ReadOnly);
3130 }
3131
3132 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3134 values_dualv_type::t_host
3136 getValuesViewHostNonConst (const RowInfo& rowinfo)
3137 {
3138 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3139 return typename values_dualv_type::t_host ();
3140 else
3141 return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3142 rowinfo.allocSize,
3143 Access::ReadWrite);
3144 }
3145
3146 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3148 values_dualv_type::t_dev::const_type
3150 getValuesViewDevice (const RowInfo& rowinfo) const
3151 {
3152 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3153 return typename values_dualv_type::t_dev::const_type ();
3154 else
3155 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3156 rowinfo.allocSize,
3157 Access::ReadOnly);
3158 }
3159
3160 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3162 values_dualv_type::t_dev
3165 {
3166 if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3167 return typename values_dualv_type::t_dev ();
3168 else
3169 return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3170 rowinfo.allocSize,
3171 Access::ReadWrite);
3172 }
3173
3174
3175 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3176 void
3179 nonconst_local_inds_host_view_type &indices,
3180 nonconst_values_host_view_type &values,
3181 size_t& numEntries) const
3182 {
3183 using Teuchos::ArrayView;
3184 using Teuchos::av_reinterpret_cast;
3185 const char tfecfFuncName[] = "getLocalRowCopy: ";
3186
3187 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3188 (! this->hasColMap (), std::runtime_error,
3189 "The matrix does not have a column Map yet. This means we don't have "
3190 "local indices for columns yet, so it doesn't make sense to call this "
3191 "method. If the matrix doesn't have a column Map yet, you should call "
3192 "fillComplete on it first.");
3193
3194 const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3195 const size_t theNumEntries = rowinfo.numEntries;
3196 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3197 (static_cast<size_t> (indices.size ()) < theNumEntries ||
3198 static_cast<size_t> (values.size ()) < theNumEntries,
3199 std::runtime_error, "Row with local index " << localRow << " has " <<
3200 theNumEntries << " entry/ies, but indices.size() = " <<
3201 indices.size () << " and values.size() = " << values.size () << ".");
3202 numEntries = theNumEntries; // first side effect
3203
3204 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3205 if (staticGraph_->isLocallyIndexed ()) {
3206 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3207 auto curVals = getValuesViewHost(rowinfo);
3208
3209 for (size_t j = 0; j < theNumEntries; ++j) {
3210 values[j] = curVals[j];
3211 indices[j] = curLclInds(j);
3212 }
3213 }
3214 else if (staticGraph_->isGloballyIndexed ()) {
3215 // Don't call getColMap(), because it touches RCP's reference count.
3216 const map_type& colMap = * (staticGraph_->colMap_);
3217 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3218 auto curVals = getValuesViewHost(rowinfo);
3219
3220 for (size_t j = 0; j < theNumEntries; ++j) {
3221 values[j] = curVals[j];
3222 indices[j] = colMap.getLocalElement (curGblInds(j));
3223 }
3224 }
3225 }
3226 }
3227
3228
3229template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3230void
3233 nonconst_global_inds_host_view_type &indices,
3234 nonconst_values_host_view_type &values,
3235 size_t& numEntries) const
3236 {
3237 using Teuchos::ArrayView;
3238 using Teuchos::av_reinterpret_cast;
3239 const char tfecfFuncName[] = "getGlobalRowCopy: ";
3240
3241 const RowInfo rowinfo =
3242 staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3243 const size_t theNumEntries = rowinfo.numEntries;
3244 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3245 static_cast<size_t> (indices.size ()) < theNumEntries ||
3246 static_cast<size_t> (values.size ()) < theNumEntries,
3247 std::runtime_error, "Row with global index " << globalRow << " has "
3248 << theNumEntries << " entry/ies, but indices.size() = " <<
3249 indices.size () << " and values.size() = " << values.size () << ".");
3250 numEntries = theNumEntries; // first side effect
3251
3252 if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3253 if (staticGraph_->isLocallyIndexed ()) {
3254 const map_type& colMap = * (staticGraph_->colMap_);
3255 auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3256 auto curVals = getValuesViewHost(rowinfo);
3257
3258 for (size_t j = 0; j < theNumEntries; ++j) {
3259 values[j] = curVals[j];
3260 indices[j] = colMap.getGlobalElement (curLclInds(j));
3261 }
3262 }
3263 else if (staticGraph_->isGloballyIndexed ()) {
3264 auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3265 auto curVals = getValuesViewHost(rowinfo);
3266
3267 for (size_t j = 0; j < theNumEntries; ++j) {
3268 values[j] = curVals[j];
3269 indices[j] = curGblInds(j);
3270 }
3271 }
3272 }
3273 }
3274
3275
3276 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277 void
3279 getLocalRowView(LocalOrdinal localRow,
3280 local_inds_host_view_type &indices,
3281 values_host_view_type &values) const
3282 {
3283 const char tfecfFuncName[] = "getLocalRowView: ";
3284
3285 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3286 isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3287 "its indices as global indices, so you cannot get a view with local "
3288 "column indices. If the matrix has a column Map, you may call "
3289 "getLocalRowCopy() to get local column indices; otherwise, you may get "
3290 "a view with global column indices by calling getGlobalRowCopy().");
3291
3292 const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3293 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3294 rowInfo.numEntries > 0) {
3295 indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3296 rowInfo.offset1D,
3297 rowInfo.numEntries,
3298 Access::ReadOnly);
3299 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3300 rowInfo.numEntries,
3301 Access::ReadOnly);
3302 }
3303 else {
3304 // This does the right thing (reports an empty row) if the input
3305 // row is invalid.
3306 indices = local_inds_host_view_type();
3307 values = values_host_view_type();
3308 }
3309
3310#ifdef HAVE_TPETRA_DEBUG
3311 const char suffix[] = ". This should never happen. Please report this "
3312 "bug to the Tpetra developers.";
3313 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3314 (static_cast<size_t> (indices.size ()) !=
3315 static_cast<size_t> (values.size ()), std::logic_error,
3316 "At the end of this method, for local row " << localRow << ", "
3317 "indices.size() = " << indices.size () << " != values.size () = "
3318 << values.size () << suffix);
3319 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3320 (static_cast<size_t> (indices.size ()) !=
3321 static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3322 "At the end of this method, for local row " << localRow << ", "
3323 "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3324 << rowInfo.numEntries << suffix);
3325 const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3326 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3327 (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3328 "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3329 << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3330 expectedNumEntries << suffix);
3331#endif // HAVE_TPETRA_DEBUG
3332 }
3333
3334
3335 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3336 void
3338 getGlobalRowView (GlobalOrdinal globalRow,
3339 global_inds_host_view_type &indices,
3340 values_host_view_type &values) const
3341 {
3342 const char tfecfFuncName[] = "getGlobalRowView: ";
3343
3344 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3345 isLocallyIndexed (), std::runtime_error,
3346 "The matrix is locally indexed, so we cannot return a view of the row "
3347 "with global column indices. Use getGlobalRowCopy() instead.");
3348
3349 // This does the right thing (reports an empty row) if the input
3350 // row is invalid.
3351 const RowInfo rowInfo =
3352 staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3353 if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3354 rowInfo.numEntries > 0) {
3355 indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3356 rowInfo.numEntries,
3357 Access::ReadOnly);
3358 values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3359 rowInfo.numEntries,
3360 Access::ReadOnly);
3361 }
3362 else {
3363 indices = global_inds_host_view_type();
3364 values = values_host_view_type();
3365 }
3366
3367#ifdef HAVE_TPETRA_DEBUG
3368 const char suffix[] = ". This should never happen. Please report this "
3369 "bug to the Tpetra developers.";
3370 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3371 (static_cast<size_t> (indices.size ()) !=
3372 static_cast<size_t> (values.size ()), std::logic_error,
3373 "At the end of this method, for global row " << globalRow << ", "
3374 "indices.size() = " << indices.size () << " != values.size () = "
3375 << values.size () << suffix);
3376 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3377 (static_cast<size_t> (indices.size ()) !=
3378 static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3379 "At the end of this method, for global row " << globalRow << ", "
3380 "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3381 << rowInfo.numEntries << suffix);
3382 const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3383 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3384 (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3385 "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3386 "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3387 " " << expectedNumEntries << suffix);
3388#endif // HAVE_TPETRA_DEBUG
3389 }
3390
3391
3392 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3393 void
3395 scale (const Scalar& alpha)
3396 {
3397 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3398
3399 const size_t nlrs = staticGraph_->getLocalNumRows ();
3400 const size_t numEntries = staticGraph_->getLocalNumEntries ();
3401 if (! staticGraph_->indicesAreAllocated () ||
3402 nlrs == 0 || numEntries == 0) {
3403 // do nothing
3404 }
3405 else {
3406
3407 auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3408 KokkosBlas::scal(vals, theAlpha, vals);
3409
3410 }
3411 }
3412
3413 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3414 void
3416 setAllToScalar (const Scalar& alpha)
3417 {
3418 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3419
3420 // replace all values in the matrix
3421 // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3422 // however, if there are no valid entries, we can short-circuit
3423 // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3424 const size_t numEntries = staticGraph_->getLocalNumEntries();
3425 if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3426 // do nothing
3427 }
3428 else {
3429 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3430 Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3431 theAlpha);
3432 // CAG: This fence was found to be required on Cuda with UVM=on.
3433 Kokkos::fence();
3434 }
3435 }
3436
3437 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3438 void
3440 setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3441 const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3442 const typename local_matrix_device_type::values_type& values)
3443 {
3444 using ProfilingRegion=Details::ProfilingRegion;
3445 ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3446 const char tfecfFuncName[] = "setAllValues: ";
3447 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3448 (columnIndices.size () != values.size (), std::invalid_argument,
3449 "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3450 " = " << values.size () << ".");
3451 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3452 (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3453
3454 try {
3455 myGraph_->setAllIndices (rowPointers, columnIndices);
3456 }
3457 catch (std::exception &e) {
3458 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3459 (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3460 "exception: " << e.what ());
3461 }
3462
3463 // Make sure that myGraph_ now has a local graph. It may not be
3464 // fillComplete yet, so it's important to check. We don't care
3465 // whether setAllIndices() did a shallow copy or a deep copy, so a
3466 // good way to check is to compare dimensions.
3467 auto lclGraph = myGraph_->getLocalGraphDevice ();
3468 const size_t numEnt = lclGraph.entries.extent (0);
3469 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3470 (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3471 numEnt != static_cast<size_t> (columnIndices.extent (0)),
3472 std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3473 "local graph. Please report this bug to the Tpetra developers.");
3474
3475 valuesPacked_wdv = values_wdv_type(values);
3476 valuesUnpacked_wdv = valuesPacked_wdv;
3477
3478 // Storage MUST be packed, since the interface doesn't give any
3479 // way to indicate any extra space at the end of each row.
3480 this->storageStatus_ = Details::STORAGE_1D_PACKED;
3481
3482 checkInternalState ();
3483 }
3484
3485 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3486 void
3488 setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3489 {
3490 using ProfilingRegion=Details::ProfilingRegion;
3491 ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3492
3493 auto graph = localDeviceMatrix.graph;
3494 //FIXME how to check whether graph is allocated
3495
3496 auto rows = graph.row_map;
3497 auto columns = graph.entries;
3498 auto values = localDeviceMatrix.values;
3499
3500 setAllValues(rows,columns,values);
3501 }
3502
3503 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3504 void
3506 setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3507 const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3508 const Teuchos::ArrayRCP<Scalar>& val)
3509 {
3510 using Kokkos::Compat::getKokkosViewDeepCopy;
3511 using Teuchos::ArrayRCP;
3512 using Teuchos::av_reinterpret_cast;
3513 typedef device_type DT;
3514 typedef impl_scalar_type IST;
3515 typedef typename local_graph_device_type::row_map_type row_map_type;
3516 //typedef typename row_map_type::non_const_value_type row_offset_type;
3517 const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3518
3519 // The row offset type may depend on the execution space. It may
3520 // not necessarily be size_t. If it's not, we need to make a deep
3521 // copy. We need to make a deep copy anyway so that Kokkos can
3522 // own the memory. Regardless, ptrIn gets the copy.
3523 typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3524 Kokkos::View<const size_t*,
3525 typename row_map_type::array_layout,
3526 Kokkos::HostSpace,
3527 Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3528 ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3529
3530 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3531 (ptrNative.extent (0) != ptrSizeT.extent (0),
3532 std::logic_error, "ptrNative.extent(0) = " <<
3533 ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3534 << ptrSizeT.extent (0) << ". Please report this bug to the "
3535 "Tpetra developers.");
3536
3537 auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3538 auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3539 this->setAllValues (ptrNative, indIn, valIn);
3540 }
3541
3542 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3543 void
3545 getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3546 {
3547 const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3548 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3549 (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3550
3551 // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3552 // this method in CrsGraph too, so don't call it (otherwise build
3553 // warnings will show up and annoy users). Instead, copy results
3554 // in and out, if the memory space requires it.
3555
3556 const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3557 if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3558 offsets.resize (lclNumRows);
3559 }
3560
3561 // The input ArrayRCP must always be a host pointer. Thus, if
3562 // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3563 // to write to that allocation directly as a Kokkos::View.
3564 if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3565 // It is always syntactically correct to assign a raw host
3566 // pointer to a device View, so this code will compile correctly
3567 // even if this branch never runs.
3568 typedef Kokkos::View<size_t*, device_type,
3569 Kokkos::MemoryUnmanaged> output_type;
3570 output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3571 staticGraph_->getLocalDiagOffsets (offsetsOut);
3572 }
3573 else {
3574 Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3575 staticGraph_->getLocalDiagOffsets (offsetsTmp);
3576 typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3577 Kokkos::MemoryUnmanaged> output_type;
3578 output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3579 // DEEP_COPY REVIEW - DEVICE-TO-HOST
3580 Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3581 }
3582 }
3583
// getLocalDiagCopy, one-argument form: write the matrix's local
// diagonal into the Vector referred to below as diag, without using
// precomputed diagonal offsets.
//
// Throws std::runtime_error if the matrix has no graph or no column
// Map.  Returns without touching diag on processes whose row Map, or
// whose row Map's communicator, is null.
//
// NOTE(review): the listing numbering jumps from 3585 to 3588 here,
// so the class qualifier and the signature line are not shown.
// Presumably the only parameter is a non-const Vector reference named
// diag -- confirm against the declaration.
3584 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3585 void
3588 {
3589 using Teuchos::ArrayRCP;
3590 using Teuchos::ArrayView;
3591 using Teuchos::av_reinterpret_cast;
// tfecfFuncName is referenced by the exception macro.
3592 const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3593 typedef local_ordinal_type LO;
3594
3595
3596 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3597 staticGraph_.is_null (), std::runtime_error,
3598 "This method requires that the matrix have a graph.");
3599 auto rowMapPtr = this->getRowMap ();
3600 if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3601 // Processes on which the row Map or its communicator is null
3602 // don't participate. Users shouldn't even call this method on
3603 // those processes.
3604 return;
3605 }
3606 auto colMapPtr = this->getColMap ();
3607 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3608 (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3609 "This method requires that the matrix have a column Map.");
// Both Map pointers were checked for null above, so dereferencing is safe.
3610 const map_type& rowMap = * rowMapPtr;
3611 const map_type& colMap = * colMapPtr;
3612 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3613
3614#ifdef HAVE_TPETRA_DEBUG
3615 // isCompatible() requires an all-reduce, and thus this check
3616 // should only be done in debug mode.
3617 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3618 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3619 "The input Vector's Map must be compatible with the CrsMatrix's row "
3620 "Map. You may check this by using Map's isCompatible method: "
3621 "diag.getMap ()->isCompatible (A.getRowMap ());");
3622#endif // HAVE_TPETRA_DEBUG
3623
// Fill-complete matrices go through the device helper, which gets the
// local row Map, local column Map, and local device matrix.
// Otherwise the work is delegated to a separate helper that takes the
// Vector and the matrix.  Both helpers' return values are discarded.
3624 if (this->isFillComplete ()) {
3625 const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3626 // 1-D subview of the first (and only) column of D_lcl.
3627 const auto D_lcl_1d =
3628 Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3629
3630 const auto lclRowMap = rowMap.getLocalMap ();
3631 const auto lclColMap = colMap.getLocalMap ();
3632 using ::Tpetra::Details::getDiagCopyWithoutOffsets;
3633 (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3634 lclColMap,
3635 getLocalMatrixDevice ());
3636 }
3637 else {
3638 using ::Tpetra::Details::getLocalDiagCopyWithoutOffsetsNotFillComplete;
3639 (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3640 }
3641 }
3642
// getLocalDiagCopy, offsets-as-Kokkos::View form: write the matrix's
// local diagonal into the Vector referred to below as diag, using
// precomputed per-row diagonal offsets held in a device View.
//
// NOTE(review): the listing numbering jumps from 3644 to 3647 here,
// so the class qualifier, the function name, and the first parameter
// are not shown.  Presumably the first parameter is a non-const
// Vector reference named diag -- confirm against the declaration.
3643 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3644 void
3647 const Kokkos::View<const size_t*, device_type,
3648 Kokkos::MemoryUnmanaged>& offsets) const
3649 {
3650 typedef LocalOrdinal LO;
3651
3652#ifdef HAVE_TPETRA_DEBUG
// tfecfFuncName is referenced by the exception macro, which is only
// used in debug builds here.
3653 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3654 const map_type& rowMap = * (this->getRowMap ());
3655 // isCompatible() requires an all-reduce, and thus this check
3656 // should only be done in debug mode.
3657 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3658 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3659 "The input Vector's Map must be compatible with (in the sense of Map::"
3660 "isCompatible) the CrsMatrix's row Map.");
3661#endif // HAVE_TPETRA_DEBUG
3662
3663 // NOTE(review): earlier comments here described filling the Vector
3664 // on the host and then syncing to device, with a host kernel that
3665 // assumes UVM.  That no longer matches the code below.
3666 //
3667 // The code below takes a device view of diag with
3668 // Access::OverwriteAll and passes it to KokkosSparse::getDiagCopy.
3669
3670 auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3671 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3672 // Get 1-D subview of the first (and only) column of D_lcl.
3673 auto D_lcl_1d =
3674 Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3675
// The extraction is done by KokkosSparse, given the output subview,
// the offsets, and the local device matrix.
3676 KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3677 getLocalMatrixDevice ());
3678 }
3679
// getLocalDiagCopy, offsets-as-ArrayView form: write the matrix's
// local diagonal into the Vector referred to below as diag, on the
// host, using precomputed per-row diagonal offsets.
//
// For each local row, the output entry is STS::zero() unless that
// row's offset differs from the invalid size_t value, in which case
// it is the packed value at (row start + offset).
//
// NOTE(review): the listing numbering jumps from 3681 to 3684 here,
// so the class qualifier, the function name, and the first parameter
// are not shown.  Presumably the first parameter is a non-const
// Vector reference named diag -- confirm against the declaration.
// NOTE(review): nothing below checks that offsets has at least as
// many entries as there are local rows.
3680 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3681 void
3684 const Teuchos::ArrayView<const size_t>& offsets) const
3685 {
3686 using LO = LocalOrdinal;
3687 using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3688 using IST = impl_scalar_type;
3689
3690#ifdef HAVE_TPETRA_DEBUG
// tfecfFuncName is referenced by the exception macro, which is only
// used in debug builds here.
3691 const char tfecfFuncName[] = "getLocalDiagCopy: ";
3692 const map_type& rowMap = * (this->getRowMap ());
3693 // isCompatible() requires an all-reduce, and thus this check
3694 // should only be done in debug mode.
3695 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3696 ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3697 "The input Vector's Map must be compatible with (in the sense of Map::"
3698 "isCompatible) the CrsMatrix's row Map.");
3699#endif // HAVE_TPETRA_DEBUG
3700
3701 // See #1510. In case diag has already been marked modified on
3702 // device, we need to clear that flag, since the code below works
3703 // on host.  (The call that did this is commented out below.)
3704 //diag.clear_sync_state ();
3705
3706 // We fill the Vector on the host.  Later, we may write a parallel
3707 // kernel that works entirely on device.  The host view is
3708 // requested with Access::OverwriteAll.
3709 auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3710 // 1-D subview of the first (and only) column of lclVecHost.
3711 auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3712
// Wrap the caller's offsets array in an unmanaged host View (no copy).
3713 using host_offsets_view_type =
3714 Kokkos::View<const size_t*, Kokkos::HostSpace,
3715 Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3716 host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3717 // Find the diagonal entries and put them in lclVecHost1d.
3718 using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3719 const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
// INV marks a row that has no diagonal entry (see the loop body).
3720 const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3721
3722 auto rowPtrsPackedHost = staticGraph_->rowPtrsPacked_host_;
3723 auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3724 Kokkos::parallel_for
3725 ("Tpetra::CrsMatrix::getLocalDiagCopy",
3726 range_type (0, myNumRows),
3727 [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3728 lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3729 if (h_offsets[lclRow] != INV) {
3730 auto curRowOffset = rowPtrsPackedHost (lclRow);
3731 lclVecHost1d(lclRow) =
3732 static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3733 }
3734 });
3735 //diag.sync_device ();
3736 }
3737
3738
// leftScale: scale the local matrix on the left by the entries of
// the Vector referred to below as x, by delegating to
// Details::leftScaleLocalCrsMatrix.
//
// x's Map must be the same as the matrix's range Map or its row Map;
// otherwise std::invalid_argument is thrown.  The matrix must be
// fill complete; otherwise std::runtime_error is thrown.
//
// NOTE(review): the listing numbering has gaps in this function
// (3740 to 3743, 3750 to 3752, 3776 to 3778).  The class qualifier
// and signature, presumably the declaration of vec_type, and the
// closing brace of the final else-branch of the Map check are not
// shown.
3739 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3740 void
3743 {
3744 using ::Tpetra::Details::ProfilingRegion;
3745 using Teuchos::ArrayRCP;
3746 using Teuchos::ArrayView;
3747 using Teuchos::null;
3748 using Teuchos::RCP;
3749 using Teuchos::rcp;
3750 using Teuchos::rcpFromRef;
// tfecfFuncName is referenced by the exception macro.
3752 const char tfecfFuncName[] = "leftScale: ";
3753
3754 ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3755
// xp ends up referring either to x itself (non-owning) or to a
// temporary Vector on the row Map filled from x.
3756 RCP<const vec_type> xp;
3757 if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3758 // Take from Epetra: If we have a non-trivial exporter, we must
3759 // import elements that are permuted or are on other processors.
3760 auto exporter = this->getCrsGraphRef ().getExporter ();
3761 if (exporter.get () != nullptr) {
3762 RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3763 tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3764 xp = tempVec;
3765 }
3766 else {
3767 xp = rcpFromRef (x);
3768 }
3769 }
3770 else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3771 xp = rcpFromRef (x);
3772 }
3773 else {
3774 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3775 (true, std::invalid_argument, "x's Map must be the same as "
3776 "either the row Map or the range Map of the CrsMatrix.");
3778
3779 if (this->isFillComplete()) {
// Read-only device view of xp's first (and only) column.
3780 auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3781 auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3782 using ::Tpetra::Details::leftScaleLocalCrsMatrix;
3783 leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3784 x_lcl_1d, false, false);
3785 }
3786 else {
3787 // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3788 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3789 (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3790 " fillComplete");
3791 }
3792 }
3793
// rightScale: scale the local matrix on the right by the entries of
// the Vector referred to below as x, by delegating to
// Details::rightScaleLocalCrsMatrix.
//
// x's Map must be the same as the matrix's domain Map or its column
// Map, and the matrix must be fill complete; otherwise
// std::runtime_error is thrown.
//
// NOTE(review): the listing numbering has gaps in this function
// (3795 to 3798, 3805 to 3807).  The class qualifier and signature,
// and presumably the declaration of vec_type, are not shown.
3794 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3795 void
3798 {
3799 using ::Tpetra::Details::ProfilingRegion;
3800 using Teuchos::ArrayRCP;
3801 using Teuchos::ArrayView;
3802 using Teuchos::null;
3803 using Teuchos::RCP;
3804 using Teuchos::rcp;
3805 using Teuchos::rcpFromRef;
// tfecfFuncName is referenced by the exception macro.
3807 const char tfecfFuncName[] = "rightScale: ";
3808
3809 ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3810
// xp ends up referring either to x itself (non-owning) or to a
// temporary Vector on the column Map filled from x.
3811 RCP<const vec_type> xp;
3812 if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3813 // Take from Epetra: If we have a non-trivial importer, we must
3814 // import elements that are permuted or are on other processors.
3815 auto importer = this->getCrsGraphRef ().getImporter ();
3816 if (importer.get () != nullptr) {
3817 RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3818 tempVec->doImport (x, *importer, REPLACE);
3819 xp = tempVec;
3820 }
3821 else {
3822 xp = rcpFromRef (x);
3823 }
3824 }
3825 else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3826 xp = rcpFromRef (x);
3827 } else {
3828 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3829 (true, std::runtime_error, "x's Map must be the same as "
3830 "either the domain Map or the column Map of the CrsMatrix.");
3831 }
3832
3833 if (this->isFillComplete()) {
// Read-only device view of xp's first (and only) column.
3834 auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3835 auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3836 using ::Tpetra::Details::rightScaleLocalCrsMatrix;
3837 rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3838 x_lcl_1d, false, false);
3839 }
3840 else {
3841 // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3842 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3843 (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3844 " fillComplete");
3845 }
3846 }
3847
3848 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3849 typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mag_type
3851 getFrobeniusNorm () const
3852 {
3853 using Teuchos::ArrayView;
3854 using Teuchos::outArg;
3855 using Teuchos::REDUCE_SUM;
3856 using Teuchos::reduceAll;
3857
3858 // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3859 // local part of this computation. It could make sense to put
3860 // this operation in the Kokkos::CrsMatrix.
3861
3862 // check the cache first
3863 mag_type mySum = STM::zero ();
3864 if (getLocalNumEntries() > 0) {
3865 if (isStorageOptimized ()) {
3866 // "Optimized" storage is packed storage. That means we can
3867 // iterate in one pass through the 1-D values array.
3868 const size_t numEntries = getLocalNumEntries ();
3869 auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3870 for (size_t k = 0; k < numEntries; ++k) {
3871 auto val = values[k];
3872 // Note (etp 06 Jan 2015) We need abs() here for composite types
3873 // (in general, if mag_type is on the left-hand-side, we need
3874 // abs() on the right-hand-side)
3875 const mag_type val_abs = STS::abs (val);
3876 mySum += val_abs * val_abs;
3877 }
3878 }
3879 else {
3880 const LocalOrdinal numRows =
3881 static_cast<LocalOrdinal> (this->getLocalNumRows ());
3882 for (LocalOrdinal r = 0; r < numRows; ++r) {
3883 const RowInfo rowInfo = myGraph_->getRowInfo (r);
3884 const size_t numEntries = rowInfo.numEntries;
3885 auto A_r = this->getValuesViewHost(rowInfo);
3886 for (size_t k = 0; k < numEntries; ++k) {
3887 const impl_scalar_type val = A_r[k];
3888 const mag_type val_abs = STS::abs (val);
3889 mySum += val_abs * val_abs;
3890 }
3891 }
3893 }
3894 mag_type totalSum = STM::zero ();
3895 reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3896 mySum, outArg (totalSum));
3897 return STM::sqrt (totalSum);
3898 }
3899
3900 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3901 void
3903 replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3904 {
3905 const char tfecfFuncName[] = "replaceColMap: ";
3906 // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3907 // Then replacing the column Map might mean that we need to
3908 // reindex the column indices.
3909 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3910 myGraph_.is_null (), std::runtime_error,
3911 "This method does not work if the matrix has a const graph. The whole "
3912 "idea of a const graph is that you are not allowed to change it, but "
3913 "this method necessarily must modify the graph, since the graph owns "
3914 "the matrix's column Map.");
3915 myGraph_->replaceColMap (newColMap);
3916 }
3917
3918 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3919 void
3921 reindexColumns (crs_graph_type* const graph,
3922 const Teuchos::RCP<const map_type>& newColMap,
3923 const Teuchos::RCP<const import_type>& newImport,
3924 const bool sortEachRow)
3926 const char tfecfFuncName[] = "reindexColumns: ";
3927 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3928 graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3929 "The input graph is null, but the matrix does not own its graph.");
3930
3931 crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3932 const bool sortGraph = false; // we'll sort graph & matrix together below
3933
3934 theGraph.reindexColumns (newColMap, newImport, sortGraph);
3935
3936 if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3937 const LocalOrdinal lclNumRows =
3938 static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3939
3940 for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3941
3942 const RowInfo rowInfo = theGraph.getRowInfo (row);
3943 auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3944 auto vals = this->getValuesViewHostNonConst (rowInfo);
3945
3946 sort2 (lclColInds.data (),
3947 lclColInds.data () + rowInfo.numEntries,
3948 vals.data ());
3949 }
3950 theGraph.indicesAreSorted_ = true;
3951 }
3952 }
3953
3954 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3955 void
3957 replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3958 {
3959 const char tfecfFuncName[] = "replaceDomainMap: ";
3960 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3961 myGraph_.is_null (), std::runtime_error,
3962 "This method does not work if the matrix has a const graph. The whole "
3963 "idea of a const graph is that you are not allowed to change it, but this"
3964 " method necessarily must modify the graph, since the graph owns the "
3965 "matrix's domain Map and Import objects.");
3966 myGraph_->replaceDomainMap (newDomainMap);
3967 }
3968
3969 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3970 void
3972 replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3973 Teuchos::RCP<const import_type>& newImporter)
3974 {
3975 const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3976 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3977 myGraph_.is_null (), std::runtime_error,
3978 "This method does not work if the matrix has a const graph. The whole "
3979 "idea of a const graph is that you are not allowed to change it, but this"
3980 " method necessarily must modify the graph, since the graph owns the "
3981 "matrix's domain Map and Import objects.");
3982 myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3983 }
3984
3985 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3986 void
3988 replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3989 {
3990 const char tfecfFuncName[] = "replaceRangeMap: ";
3991 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3992 myGraph_.is_null (), std::runtime_error,
3993 "This method does not work if the matrix has a const graph. The whole "
3994 "idea of a const graph is that you are not allowed to change it, but this"
3995 " method necessarily must modify the graph, since the graph owns the "
3996 "matrix's domain Map and Import objects.");
3997 myGraph_->replaceRangeMap (newRangeMap);
3998 }
3999
4000 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4001 void
4003 replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
4004 Teuchos::RCP<const export_type>& newExporter)
4005 {
4006 const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
4007 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4008 myGraph_.is_null (), std::runtime_error,
4009 "This method does not work if the matrix has a const graph. The whole "
4010 "idea of a const graph is that you are not allowed to change it, but this"
4011 " method necessarily must modify the graph, since the graph owns the "
4012 "matrix's domain Map and Import objects.");
4013 myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
4014 }
4015
4016 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4017 void
4019 insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4020 const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4021 const Teuchos::ArrayView<const Scalar>& values)
4022 {
4023 using Teuchos::Array;
4024 typedef GlobalOrdinal GO;
4025 typedef typename Array<GO>::size_type size_type;
4026
4027 const size_type numToInsert = indices.size ();
4028 // Add the new data to the list of nonlocals.
4029 // This creates the arrays if they don't exist yet.
4030 std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4031 Array<GO>& curRowInds = curRow.first;
4032 Array<Scalar>& curRowVals = curRow.second;
4033 const size_type newCapacity = curRowInds.size () + numToInsert;
4034 curRowInds.reserve (newCapacity);
4035 curRowVals.reserve (newCapacity);
4036 for (size_type k = 0; k < numToInsert; ++k) {
4037 curRowInds.push_back (indices[k]);
4038 curRowVals.push_back (values[k]);
4039 }
4040 }
4041
4042 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4043 void
4046 {
4047 using Details::Behavior;
4049 using Teuchos::Comm;
4050 using Teuchos::outArg;
4051 using Teuchos::RCP;
4052 using Teuchos::rcp;
4053 using Teuchos::REDUCE_MAX;
4054 using Teuchos::REDUCE_MIN;
4055 using Teuchos::reduceAll;
4056 using std::endl;
4058 //typedef LocalOrdinal LO;
4059 typedef GlobalOrdinal GO;
4060 typedef typename Teuchos::Array<GO>::size_type size_type;
4061 const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4062 ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4063
4064 const bool verbose = Behavior::verbose("CrsMatrix");
4065 std::unique_ptr<std::string> prefix;
4066 if (verbose) {
4067 prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4068 std::ostringstream os;
4069 os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4070 << endl;
4071 std::cerr << os.str();
4072 }
4073 RCP<const Comm<int> > comm = getComm ();
4074
4075 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4076 (! isFillActive (), std::runtime_error, "Fill must be active before "
4077 "you may call this method.");
4078
4079 const size_t myNumNonlocalRows = nonlocals_.size ();
4080
4081 // If no processes have nonlocal rows, then we don't have to do
4082 // anything. Checking this is probably cheaper than constructing
4083 // the Map of nonlocal rows (see below) and noticing that it has
4084 // zero global entries.
4085 {
4086 const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4087 int someoneHasNonlocalRows = 0;
4088 reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4089 outArg (someoneHasNonlocalRows));
4090 if (someoneHasNonlocalRows == 0) {
4091 return; // no process has nonlocal rows, so nothing to do
4092 }
4093 }
4094
4095 // 1. Create a list of the "nonlocal" rows on each process. this
4096 // requires iterating over nonlocals_, so while we do this,
4097 // deduplicate the entries and get a count for each nonlocal
4098 // row on this process.
4099 // 2. Construct a new row Map corresponding to those rows. This
4100 // Map is likely overlapping. We know that the Map is not
4101 // empty on all processes, because the above all-reduce and
4102 // return exclude that case.
4103
4104 RCP<const map_type> nonlocalRowMap;
4105 Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4106 {
4107 Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4108 size_type curPos = 0;
4109 for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4110 ++mapIter, ++curPos) {
4111 myNonlocalGblRows[curPos] = mapIter->first;
4112 // Get the values and column indices by reference, since we
4113 // intend to change them in place (that's what "erase" does).
4114 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4115 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4116
4117 // Sort both arrays jointly, using the column indices as keys,
4118 // then merge them jointly. "Merge" here adds values
4119 // corresponding to the same column indices. The first 2 args
4120 // of merge2 are output arguments that work just like the
4121 // return value of std::unique.
4122 sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4123 typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4124 typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4125 merge2 (gblCols_newEnd, vals_newEnd,
4126 gblCols.begin (), gblCols.end (),
4127 vals.begin (), vals.end ());
4128 gblCols.erase (gblCols_newEnd, gblCols.end ());
4129 vals.erase (vals_newEnd, vals.end ());
4130 numEntPerNonlocalRow[curPos] = gblCols.size ();
4131 }
4132
4133 // Currently, Map requires that its indexBase be the global min
4134 // of all its global indices. Map won't compute this for us, so
4135 // we must do it. If our process has no nonlocal rows, set the
4136 // "min" to the max possible GO value. This ensures that if
4137 // some process has at least one nonlocal row, then it will pick
4138 // that up as the min. We know that at least one process has a
4139 // nonlocal row, since the all-reduce and return at the top of
4140 // this method excluded that case.
4141 GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4142 {
4143 auto iter = std::min_element (myNonlocalGblRows.begin (),
4144 myNonlocalGblRows.end ());
4145 if (iter != myNonlocalGblRows.end ()) {
4146 myMinNonlocalGblRow = *iter;
4147 }
4148 }
4149 GO gblMinNonlocalGblRow = 0;
4150 reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4151 outArg (gblMinNonlocalGblRow));
4152 const GO indexBase = gblMinNonlocalGblRow;
4153 const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4154 nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4155 }
4156
4157 // 3. Use the values and column indices for each nonlocal row, as
4158 // stored in nonlocals_, to construct a CrsMatrix corresponding
4159 // to nonlocal rows. We have
4160 // exact counts of the number of entries in each nonlocal row.
4161
4162 if (verbose) {
4163 std::ostringstream os;
4164 os << *prefix << "Create nonlocal matrix" << endl;
4165 std::cerr << os.str();
4166 }
4167 RCP<crs_matrix_type> nonlocalMatrix =
4168 rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4169 {
4170 size_type curPos = 0;
4171 for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4172 ++mapIter, ++curPos) {
4173 const GO gblRow = mapIter->first;
4174 // Get values & column indices by ref, just to avoid copy.
4175 Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4176 Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4177 //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4178 nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4179 }
4180 }
4181 // There's no need to fill-complete the nonlocals matrix.
4182 // We just use it as a temporary container for the Export.
4183
4184 // 4. If the original row Map is one to one, then we can Export
4185 // directly from nonlocalMatrix into this. Otherwise, we have
4186 // to create a temporary matrix with a one-to-one row Map,
4187 // Export into that, then Import from the temporary matrix into
4188 // *this.
4189
4190 auto origRowMap = this->getRowMap ();
4191 const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4192
4193 int isLocallyComplete = 1; // true by default
4194
4195 if (origRowMapIsOneToOne) {
4196 if (verbose) {
4197 std::ostringstream os;
4198 os << *prefix << "Original row Map is 1-to-1" << endl;
4199 std::cerr << os.str();
4200 }
4201 export_type exportToOrig (nonlocalRowMap, origRowMap);
4202 if (! exportToOrig.isLocallyComplete ()) {
4203 isLocallyComplete = 0;
4204 }
4205 if (verbose) {
4206 std::ostringstream os;
4207 os << *prefix << "doExport from nonlocalMatrix" << endl;
4208 std::cerr << os.str();
4209 }
4210 this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4211 // We're done at this point!
4212 }
4213 else {
4214 if (verbose) {
4215 std::ostringstream os;
4216 os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4217 std::cerr << os.str();
4218 }
4219 // If you ask a Map whether it is one to one, it does some
4220 // communication and stashes intermediate results for later use
4221 // by createOneToOne. Thus, calling createOneToOne doesn't cost
4222 // much more than the original cost of calling isOneToOne.
4223 auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4224 export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4225 if (! exportToOneToOne.isLocallyComplete ()) {
4226 isLocallyComplete = 0;
4227 }
4228
4229 // Create a temporary matrix with the one-to-one row Map.
4230 //
4231 // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4232 // each row, to avoid reallocation during the Export operation.
4233 if (verbose) {
4234 std::ostringstream os;
4235 os << *prefix << "Create & doExport into 1-to-1 matrix"
4236 << endl;
4237 std::cerr << os.str();
4238 }
4239 crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4240 // Export from matrix of nonlocals into the temp one-to-one matrix.
4241 oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4242 Tpetra::ADD);
4243
4244 // We don't need the matrix of nonlocals anymore, so get rid of
4245 // it, to keep the memory high-water mark down.
4246 if (verbose) {
4247 std::ostringstream os;
4248 os << *prefix << "Free nonlocalMatrix" << endl;
4249 std::cerr << os.str();
4250 }
4251 nonlocalMatrix = Teuchos::null;
4252
4253 // Import from the one-to-one matrix to the original matrix.
4254 if (verbose) {
4255 std::ostringstream os;
4256 os << *prefix << "doImport from 1-to-1 matrix" << endl;
4257 std::cerr << os.str();
4258 }
4259 import_type importToOrig (oneToOneRowMap, origRowMap);
4260 this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4261 }
4262
4263 // It's safe now to clear out nonlocals_, since we've already
4264 // committed side effects to *this. The standard idiom for
4265 // clearing a Container like std::map, is to swap it with an empty
4266 // Container and let the swapped Container fall out of scope.
4267 if (verbose) {
4268 std::ostringstream os;
4269 os << *prefix << "Free nonlocals_ (std::map)" << endl;
4270 std::cerr << os.str();
4271 }
4272 decltype (nonlocals_) newNonlocals;
4273 std::swap (nonlocals_, newNonlocals);
4274
4275 // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4276 // don't like throwing an exception here. A local return value
4277 // would likely be more useful to users. However, if users find
4278 // themselves exercising nonlocal inserts often, then they are
4279 // probably novice users who need the help. See GitHub Issues
4280 // #603 and #601 (esp. the latter) for discussion.
4281
4282 int isGloballyComplete = 0; // output argument of reduceAll
4283 reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4284 outArg (isGloballyComplete));
4285 TEUCHOS_TEST_FOR_EXCEPTION
4286 (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4287 "you called insertGlobalValues with a global row index which is not in "
4288 "the matrix's row Map on any process in its communicator.");
4289 }
4290
4291 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4292 void
4294 resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4295 {
4296 if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4297 myGraph_->resumeFill (params);
4298 }
4299 fillComplete_ = false;
4300 }
4301
4302 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4303 bool
4305 haveGlobalConstants() const {
4306 return getCrsGraphRef ().haveGlobalConstants ();
4307 }
4308
4309 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4310 void
4312 fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4313 {
4314 const char tfecfFuncName[] = "fillComplete(params): ";
4315
4316 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4317 (this->getCrsGraph ().is_null (), std::logic_error,
4318 "getCrsGraph() returns null. This should not happen at this point. "
4319 "Please report this bug to the Tpetra developers.");
4320
4321 const crs_graph_type& graph = this->getCrsGraphRef ();
4322 if (this->isStaticGraph () && graph.isFillComplete ()) {
4323 // If this matrix's graph is fill complete and the user did not
4324 // supply a domain or range Map, use the graph's domain and
4325 // range Maps.
4326 this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4327 }
4328 else { // assume that user's row Map is the domain and range Map
4329 Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4330 Teuchos::RCP<const map_type> domainMap = rangeMap;
4331 this->fillComplete (domainMap, rangeMap, params);
4332 }
4333 }
4334
// fillComplete with explicit domain and range Maps.
//
// Steps performed by the code below, in order:
//   1. Require that fill is active and the matrix is not already fill
//      complete (std::runtime_error otherwise).
//   2. Read "No Nonlocal Changes" and the ghost-GID sorting option from
//      params (if params is nonnull).
//   3. Allocate values if the graph's indices are not yet allocated.
//   4. Call globalAssemble() when there is more than one process and the
//      caller did not assert that there were no nonlocal changes.
//   5. Static graph: fillLocalMatrix().  Owned graph: set the graph's
//      domain/range Maps, build the column Map if needed, make indices
//      local, sort and merge, build Import/Export, fillLocalGraphAndMatrix(),
//      and compute the graph's global or local constants.
//   6. Mark the matrix fill complete and run checkInternalState().
//
// NOTE(review): this listing omits a few source lines (the class qualifier,
// a using-declaration, and the opening of one ProfilingRegion declaration).
4335 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4336 void
4338 fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4339 const Teuchos::RCP<const map_type>& rangeMap,
4340 const Teuchos::RCP<Teuchos::ParameterList>& params)
4341 {
4342 using Details::Behavior;
4344 using Teuchos::ArrayRCP;
4345 using Teuchos::RCP;
4346 using Teuchos::rcp;
4347 using std::endl;
4348 const char tfecfFuncName[] = "fillComplete: ";
4349 ProfilingRegion regionFillComplete
4350 ("Tpetra::CrsMatrix::fillComplete");
4351 const bool verbose = Behavior::verbose("CrsMatrix");
4352 std::unique_ptr<std::string> prefix;
4353 if (verbose) {
4354 prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4355 std::ostringstream os;
4356 os << *prefix << endl;
4357 std::cerr << os.str ();
4358 }
// NOTE(review): the next two lines are the trailing arguments of a
// declaration whose first line is missing from this listing.
4360 "Tpetra::CrsMatrix::fillCompete",
4361 "fillCompete");
4362
4363 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4364 (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4365 "Matrix fill state must be active (isFillActive() "
4366 "must be true) before you may call fillComplete().");
4367 const int numProcs = this->getComm ()->getSize ();
4368
4369 //
4370 // Read parameters from the input ParameterList.
4371 //
4372 {
4373 Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4374
4375 // If true, the caller promises that no process did nonlocal
4376 // changes since the last call to fillComplete.
4377 bool assertNoNonlocalInserts = false;
4378 // If true, makeColMap sorts remote GIDs (within each remote
4379 // process' group).
4380 bool sortGhosts = true;
4381
4382 if (! params.is_null ()) {
4383 assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4384 assertNoNonlocalInserts);
// Two spellings of the ghost-sorting parameter are accepted; the
// all-lowercase spelling takes precedence if both are present.
4385 if (params->isParameter ("sort column map ghost gids")) {
4386 sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4387 }
4388 else if (params->isParameter ("Sort column Map ghost GIDs")) {
4389 sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4390 }
4391 }
4392 // We also don't need to do global assembly if there is only one
4393 // process in the communicator.
4394 const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4395 // This parameter only matters if this matrix owns its graph.
4396 if (! this->myGraph_.is_null ()) {
4397 this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4398 }
4399
// Allocate storage now if nothing has been allocated yet; the index
// kind (local vs. global) depends on whether a column Map exists.
4400 if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4401 if (this->hasColMap ()) { // use local indices
4402 allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4403 }
4404 else { // no column Map, so use global indices
4405 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4406 }
4407 }
4408 // Global assemble, if we need to. This call only costs a single
4409 // all-reduce if we didn't need global assembly after all.
4410 if (needGlobalAssemble) {
4411 this->globalAssemble ();
4412 }
4413 else {
// Without global assembly, leftover nonlocal entries on a
// single-process run can never be delivered, so report them.
4414 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4415 (numProcs == 1 && nonlocals_.size() > 0,
4416 std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4417 "An invalid entry (i.e., with row index not in the row Map) must have "
4418 "been submitted to the CrsMatrix.");
4419 }
4420 }
4421 if (this->isStaticGraph ()) {
4422 Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4423 // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4424 // checks below only in debug mode. It would be nicer to do a
4425 // local check, then propagate the error state in a deferred
4426 // way, whenever communication happens. That would reduce the
4427 // cost of checking, to the point where it may make sense to
4428 // enable it even in release mode.
4429#ifdef HAVE_TPETRA_DEBUG
4430 // FIXME (mfh 18 Jun 2014) This check for correctness of the
4431 // input Maps incurs a penalty of two all-reduces for the
4432 // otherwise optimal const graph case.
4433 //
4434 // We could turn these (max) 2 all-reduces into (max) 1, by
4435 // fusing them. We could do this by adding a "locallySameAs"
4436 // method to Map, which would return one of four states:
4437 //
4438 // a. Certainly globally the same
4439 // b. Certainly globally not the same
4440 // c. Locally the same
4441 // d. Locally not the same
4442 //
4443 // The first two states don't require further communication.
4444 // The latter two states require an all-reduce to communicate
4445 // globally, but we only need one all-reduce, since we only need
4446 // to check whether at least one of the Maps is wrong.
4447 const bool domainMapsMatch =
4448 this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4449 const bool rangeMapsMatch =
4450 this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4451
4452 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4453 (! domainMapsMatch, std::runtime_error,
4454 "The CrsMatrix's domain Map does not match the graph's domain Map. "
4455 "The graph cannot be changed because it was given to the CrsMatrix "
4456 "constructor as const. You can fix this by passing in the graph's "
4457 "domain Map and range Map to the matrix's fillComplete call.");
4458
4459 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4460 (! rangeMapsMatch, std::runtime_error,
4461 "The CrsMatrix's range Map does not match the graph's range Map. "
4462 "The graph cannot be changed because it was given to the CrsMatrix "
4463 "constructor as const. You can fix this by passing in the graph's "
4464 "domain Map and range Map to the matrix's fillComplete call.");
4465#endif // HAVE_TPETRA_DEBUG
4466
4467 // The matrix does _not_ own the graph, and the graph's
4468 // structure is already fixed, so just fill the local matrix.
4469 this->fillLocalMatrix (params);
4470 }
4471 else {
4472 Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4473 // Set the graph's domain and range Maps. This will clear the
4474 // Import if the domain Map has changed (is a different
4475 // pointer), and the Export if the range Map has changed (is a
4476 // different pointer).
4477 this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4478
4479 // Make the graph's column Map, if necessary.
4480 Teuchos::Array<int> remotePIDs (0);
4481 const bool mustBuildColMap = ! this->hasColMap ();
4482 if (mustBuildColMap) {
4483 this->myGraph_->makeColMap (remotePIDs);
4484 }
4485
4486 // Make indices local, if necessary. The method won't do
4487 // anything if the graph is already locally indexed.
4488 const std::pair<size_t, std::string> makeIndicesLocalResult =
4489 this->myGraph_->makeIndicesLocal(verbose);
4490 // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4491 // the error state to makeImportExport
4492 // which may do all-reduces and thus may
4493 // have the opportunity to communicate that error state.
// A nonzero first member signals failure; the second member is used
// as the exception message.
4494 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4495 (makeIndicesLocalResult.first != 0, std::runtime_error,
4496 makeIndicesLocalResult.second);
4497
// Sort and merge only what the graph reports as not yet sorted/merged.
4498 const bool sorted = this->myGraph_->isSorted ();
4499 const bool merged = this->myGraph_->isMerged ();
4500 this->sortAndMergeIndicesAndValues (sorted, merged);
4501
4502 // Make Import and Export objects, if they haven't been made
4503 // already. If we made a column Map above, reuse information
4504 // from that process to avoid communication in the Import setup.
4505 this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4506
4507 // The matrix _does_ own the graph, so fill the local graph at
4508 // the same time as the local matrix.
4509 this->fillLocalGraphAndMatrix (params);
4510
// Global constants are computed unless params explicitly sets
// "compute global constants" to false.
4511 const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4512 params->get ("compute global constants", true);
4513 if (callGraphComputeGlobalConstants) {
4514 this->myGraph_->computeGlobalConstants ();
4515 }
4516 else {
4517 this->myGraph_->computeLocalConstants ();
4518 }
4519 this->myGraph_->fillComplete_ = true;
4520 this->myGraph_->checkInternalState ();
4521 }
4522
4523 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4524
4525 this->fillComplete_ = true; // Now we're fill complete!
4526 {
4527 Details::ProfilingRegion region_cis(
4528 "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4529 );
4530 this->checkInternalState ();
4531 }
4532 }
4533
4534 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4535 void
4537 expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4538 const Teuchos::RCP<const map_type> & rangeMap,
4539 const Teuchos::RCP<const import_type>& importer,
4540 const Teuchos::RCP<const export_type>& exporter,
4541 const Teuchos::RCP<Teuchos::ParameterList> &params)
4542 {
4543#ifdef HAVE_TPETRA_MMM_TIMINGS
4544 std::string label;
4545 if(!params.is_null())
4546 label = params->get("Timer Label",label);
4547 std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4548 using Teuchos::TimeMonitor;
4549
4550 Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4551#endif
4552
4553 const char tfecfFuncName[] = "expertStaticFillComplete: ";
4554 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4555 std::runtime_error, "Matrix fill state must be active (isFillActive() "
4556 "must be true) before calling fillComplete().");
4557 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4558 myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4559
4560 {
4561#ifdef HAVE_TPETRA_MMM_TIMINGS
4562 Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4563#endif
4564 // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4565 myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4566 }
4567
4568 {
4569#ifdef HAVE_TPETRA_MMM_TIMINGS
4570 TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4571#endif
4572 // Fill the local graph and matrix
4573 fillLocalGraphAndMatrix (params);
4574 }
4575 // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4576
4577 // Now we're fill complete!
4578 fillComplete_ = true;
4579
4580 // Sanity checks at the end.
4581#ifdef HAVE_TPETRA_DEBUG
4582 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4583 ": We're at the end of fillComplete(), but isFillActive() is true. "
4584 "Please report this bug to the Tpetra developers.");
4585 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4586 ": We're at the end of fillComplete(), but isFillActive() is true. "
4587 "Please report this bug to the Tpetra developers.");
4588#endif // HAVE_TPETRA_DEBUG
4589 {
4590#ifdef HAVE_TPETRA_MMM_TIMINGS
4591 Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4592#endif
4593
4594 checkInternalState();
4595 }
4596 }
4597
4598 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4600 mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4601 {
4602 impl_scalar_type* rowValueIter = vals;
4603 // beg,end define a half-exclusive interval over which to iterate.
4604 LocalOrdinal* beg = cols;
4605 LocalOrdinal* end = cols + rowLen;
4606 LocalOrdinal* newend = beg;
4607 if (beg != end) {
4608 LocalOrdinal* cur = beg + 1;
4609 impl_scalar_type* vcur = rowValueIter + 1;
4610 impl_scalar_type* vend = rowValueIter;
4611 cur = beg+1;
4612 while (cur != end) {
4613 if (*cur != *newend) {
4614 // new entry; save it
4615 ++newend;
4616 ++vend;
4617 (*newend) = (*cur);
4618 (*vend) = (*vcur);
4619 }
4620 else {
4621 // old entry; merge it
4622 //(*vend) = f (*vend, *vcur);
4623 (*vend) += *vcur;
4624 }
4625 ++cur;
4626 ++vcur;
4627 }
4628 ++newend; // one past the last entry, per typical [beg,end) semantics
4629 }
4630 return newend - beg;
4631 }
4632
4633 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4634 void
4636 sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4637 {
4638 using ::Tpetra::Details::ProfilingRegion;
4639 typedef LocalOrdinal LO;
4640 typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4641 host_execution_space;
4642 typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4643 const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4644 ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4645
4646 if (! sorted || ! merged) {
4647 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4648 (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4649 "\"static\" (const) graph, since the matrix does not own the graph.");
4650 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4651 (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4652 "this matrix claims ! isStaticGraph(). "
4653 "Please report this bug to the Tpetra developers.");
4654 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4655 (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4656 "this method if the graph's storage has already been optimized. "
4657 "Please report this bug to the Tpetra developers.");
4658
4659 crs_graph_type& graph = * (this->myGraph_);
4660 const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4661 size_t totalNumDups = 0;
4662 {
4663 //Accessing host unpacked (4-array CRS) local matrix.
4664 auto rowBegins_ = graph.rowPtrsUnpacked_host_;
4665 auto rowLengths_ = graph.k_numRowEntries_;
4666 auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4667 auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4668 Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4669 [=] (const LO lclRow, size_t& numDups) {
4670 size_t rowBegin = rowBegins_(lclRow);
4671 size_t rowLen = rowLengths_(lclRow);
4672 LO* cols = cols_.data() + rowBegin;
4673 impl_scalar_type* vals = vals_.data() + rowBegin;
4674 if (! sorted) {
4675 sort2 (cols, cols + rowLen, vals);
4676 }
4677 if (! merged) {
4678 size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4679 rowLengths_(lclRow) = newRowLength;
4680 numDups += rowLen - newRowLength;
4681 }
4682 }, totalNumDups);
4683 }
4684 if (! sorted) {
4685 graph.indicesAreSorted_ = true; // we just sorted every row
4686 }
4687 if (! merged) {
4688 graph.noRedundancies_ = true; // we just merged every row
4689 }
4690 }
4691 }
4692
// Non-transpose sparse matrix-(multi)vector product:
//   Y_in = beta*Y_in + alpha*A*X_in.
//
// Outline of the code below:
//   - alpha == 0: only scale (or zero) Y_in and return.
//   - Obtain a column Map version of X_in, via Import if the graph has an
//     Importer, or via a constant-stride copy if X_in is not constant stride.
//   - If the graph has an Exporter, multiply into the row Map temporary and
//     Export (ADD_ASSIGN) into Y_in; otherwise multiply into Y_in directly,
//     or through the row Map temporary if Y_in is not constant stride or is
//     the same object as the input.
//   - If Y_in is replicated across more than one process, reduce it at the end.
//
// NOTE(review): the lines carrying the class qualifier, the function name
// and the X_in / Y_in parameters are missing from this listing, as is a
// using-declaration for ProfilingRegion.
4693 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4694 void
4698 Scalar alpha,
4699 Scalar beta) const
4700 {
4702 using Teuchos::RCP;
4703 using Teuchos::rcp;
4704 using Teuchos::rcp_const_cast;
4705 using Teuchos::rcpFromRef;
4706 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4707 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4708
4709 // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4710 // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4711 // failing only for the Kokkos refactor version of Tpetra. It's a
4712 // good idea regardless to have the bypass.
4713 if (alpha == ZERO) {
4714 if (beta == ZERO) {
4715 Y_in.putScalar (ZERO);
4716 } else if (beta != ONE) {
4717 Y_in.scale (beta);
4718 }
4719 return;
4720 }
4721
4722 // It's possible that X is a view of Y or vice versa. We don't
4723 // allow this (apply() requires that X and Y not alias one
4724 // another), but it's helpful to detect and work around this case.
4725 // We don't try to detect the more subtle cases (e.g., one is a
4726 // subview of the other, but their initial pointers differ). We
4727 // only need to do this if this matrix's Import is trivial;
4728 // otherwise, we don't actually apply the operator from X into Y.
4729
4730 RCP<const import_type> importer = this->getGraph ()->getImporter ();
4731 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4732
4733 // If beta == 0, then the output MV will be overwritten; none of
4734 // its entries should be read. (Sparse BLAS semantics say that we
4735 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4736 // This matters if we need to do an Export operation; see below.
// Captured before beta may be reset for the replicated case below.
4737 const bool Y_is_overwritten = (beta == ZERO);
4738
4739 // We treat the case of a replicated MV output specially.
4740 const bool Y_is_replicated =
4741 (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4742
4743 // This is part of the special case for replicated MV output.
4744 // We'll let each process do its thing, but do an all-reduce at
4745 // the end to sum up the results. Setting beta=0 on all processes
4746 // but Proc 0 makes the math work out for the all-reduce. (This
4747 // assumes that the replicated data is correctly replicated, so
4748 // that the data are the same on all processes.)
4749 if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4750 beta = ZERO;
4751 }
4752
4753 // Temporary MV for Import operation. After the block of code
4754 // below, this will be an (Imported if necessary) column Map MV
4755 // ready to give to localApply(...).
4756 RCP<const MV> X_colMap;
4757 if (importer.is_null ()) {
4758 if (! X_in.isConstantStride ()) {
4759 // Not all sparse mat-vec kernels can handle an input MV with
4760 // nonconstant stride correctly, so we have to copy it in that
4761 // case into a constant stride MV. To make a constant stride
4762 // copy of X_in, we force creation of the column (== domain)
4763 // Map MV (if it hasn't already been created, else fetch the
4764 // cached copy). This avoids creating a new MV each time.
4765 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4766 Tpetra::deep_copy (*X_colMapNonConst, X_in);
4767 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4768 }
4769 else {
4770 // The domain and column Maps are the same, so do the local
4771 // multiply using the domain Map input MV X_in.
4772 X_colMap = rcpFromRef (X_in);
4773 }
4774 }
4775 else { // need to Import source (multi)vector
4776 ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4777
4778 // We're doing an Import anyway, which will copy the relevant
4779 // elements of the domain Map MV X_in into a separate column Map
4780 // MV. Thus, we don't have to worry whether X_in is constant
4781 // stride.
4782 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4783
4784 // Import from the domain Map MV to the column Map MV.
4785 X_colMapNonConst->doImport (X_in, *importer, INSERT);
4786 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4787 }
4788
4789 // Temporary MV for doExport (if needed), or for copying a
4790 // nonconstant stride output MV into a constant stride MV. This
4791 // is null if we don't need the temporary MV, that is, if the
4792 // Export is trivial (null).
4793 RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4794
4795 // If we have a nontrivial Export object, we must perform an
4796 // Export. In that case, the local multiply result will go into
4797 // the row Map multivector. We don't have to make a
4798 // constant-stride version of Y_in in this case, because we had to
4799 // make a constant stride Y_rowMap MV and do an Export anyway.
4800 if (! exporter.is_null ()) {
// beta is applied to Y_in below, not here, so the local multiply
// overwrites Y_rowMap (beta argument is ZERO).
4801 this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4802 {
4803 ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4804
4805 // If we're overwriting the output MV Y_in completely (beta ==
4806 // 0), then make sure that it is filled with zeros before we
4807 // do the Export. Otherwise, the ADD combine mode will use
4808 // data in Y_in, which is supposed to be zero.
4809 if (Y_is_overwritten) {
4810 Y_in.putScalar (ZERO);
4811 }
4812 else {
4813 // Scale output MV by beta, so that doExport sums in the
4814 // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4815 Y_in.scale (beta);
4816 }
4817 // Do the Export operation.
4818 Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4819 }
4820 }
4821 else { // Don't do an Export: row Map and range Map are the same.
4822 //
4823 // If Y_in does not have constant stride, or if the column Map
4824 // MV aliases Y_in, then we can't let the kernel write directly
4825 // to Y_in. Instead, we have to use the cached row (== range)
4826 // Map MV as temporary storage.
4827 //
4828 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4829 // the user passed in the same MultiVector for both X and Y. It
4830 // won't detect whether one MultiVector views the other. We
4831 // should also check the MultiVectors' raw data pointers.
4832 if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4833 // Force creating the MV if it hasn't been created already.
4834 // This will reuse a previously created cached MV.
4835 Y_rowMap = getRowMapMultiVector (Y_in, true);
4836
4837 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4838 // since we're overwriting it anyway.
4839 if (beta != ZERO) {
4840 Tpetra::deep_copy (*Y_rowMap, Y_in);
4841 }
4842 this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
// Copy the result from the temporary back into the caller's output.
4843 Tpetra::deep_copy (Y_in, *Y_rowMap);
4844 }
4845 else {
4846 this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4847 }
4848 }
4849
4850 // If the range Map is a locally replicated Map, sum up
4851 // contributions from each process. We set beta = 0 on all
4852 // processes but Proc 0 initially, so this will handle the scaling
4853 // factor beta correctly.
4854 if (Y_is_replicated) {
4855 ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4856 Y_in.reduce ();
4857 }
4858 }
4859
// Transpose-mode sparse matrix-(multi)vector product; the transpose kind
// is selected by 'mode', which is passed straight to localApply.
//
// Outline of the code below:
//   - alpha == 0: only scale (or zero) Y_in and return.
//   - The roles of Importer and Exporter are swapped relative to the
//     non-transpose product: the Exporter is used to Import X_in into
//     exportMV_, and the Importer is used to Export importMV_ into Y_in.
//   - importMV_ / exportMV_ are cached temporaries, re-created when the
//     number of vectors changes.
//   - If Y_in is not distributed, reduce it at the end.
//
// NOTE(review): the lines carrying the class qualifier, the function name
// and the X_in / Y_in parameters are missing from this listing, as is a
// using-declaration for ProfilingRegion.
4860 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4861 void
4865 const Teuchos::ETransp mode,
4866 Scalar alpha,
4867 Scalar beta) const
4868 {
4870 using Teuchos::null;
4871 using Teuchos::RCP;
4872 using Teuchos::rcp;
4873 using Teuchos::rcp_const_cast;
4874 using Teuchos::rcpFromRef;
4875 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4876
4877 // Take shortcuts for alpha == 0.
4878 if (alpha == ZERO) {
4879 // Follow the Sparse BLAS convention by ignoring both the matrix
4880 // and X_in, in this case.
4881 if (beta == ZERO) {
4882 // Follow the Sparse BLAS convention by overwriting any Inf or
4883 // NaN values in Y_in, in this case.
4884 Y_in.putScalar (ZERO);
4885 }
4886 else {
4887 Y_in.scale (beta);
4888 }
4889 return;
4890 }
4891
4892 const size_t numVectors = X_in.getNumVectors ();
4893
4894 // We don't allow X_in and Y_in to alias one another. It's hard
4895 // to check this, because advanced users could create views from
4896 // raw pointers. However, if X_in and Y_in reference the same
4897 // object, we will do the user a favor by copying X into new
4898 // storage (with a warning). We only need to do this if we have
4899 // trivial importers; otherwise, we don't actually apply the
4900 // operator from X into Y.
4901 RCP<const import_type> importer = this->getGraph ()->getImporter ();
4902 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4903 // access X indirectly, in case we need to create temporary storage
4904 RCP<const MV> X;
4905
4906 // some parameters for below
// NOTE(review): unlike the non-transpose product earlier in this file,
// this flag does not also require getComm()->getSize() != 1, so it is
// true on a single-process run with a non-distributed Y_in -- confirm
// whether that is intended.
4907 const bool Y_is_replicated = ! Y_in.isDistributed ();
// Captured before beta may be reset for the replicated case below.
4908 const bool Y_is_overwritten = (beta == ZERO);
4909 if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4910 beta = ZERO;
4911 }
4912
4913 // The kernels do not allow input or output with nonconstant stride.
4914 if (! X_in.isConstantStride () && importer.is_null ()) {
4915 X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4916 } else {
4917 X = rcpFromRef (X_in); // Reference to X_in
4918 }
4919
4920 // Set up temporary multivectors for Import and/or Export.
// A cached temporary with the wrong number of vectors is discarded
// and re-created.
4921 if (importer != Teuchos::null) {
4922 if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4923 importMV_ = null;
4924 }
4925 if (importMV_ == null) {
4926 importMV_ = rcp (new MV (this->getColMap (), numVectors));
4927 }
4928 }
4929 if (exporter != Teuchos::null) {
4930 if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4931 exportMV_ = null;
4932 }
4933 if (exportMV_ == null) {
4934 exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4935 }
4936 }
4937
4938 // If we have a non-trivial exporter, we must import elements that
4939 // are permuted or are on other processors.
4940 if (! exporter.is_null ()) {
4941 ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4942 exportMV_->doImport (X_in, *exporter, INSERT);
4943 X = exportMV_; // multiply out of exportMV_
4944 }
4945
4946 // If we have a non-trivial importer, we must export elements that
4947 // are permuted or belong to other processors. We will compute
4948 // solution into the to-be-exported MV; get a view.
4949 if (importer != Teuchos::null) {
4950 ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4951
4952 // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4953 // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4954 // back and figure out why this helps. importMV_ SHOULD be
4955 // completely overwritten in the localApply(...) call
4956 // below, because beta == ZERO there.
4957 importMV_->putScalar (ZERO);
4958 // Do the local computation.
4959 this->localApply (*X, *importMV_, mode, alpha, ZERO);
4960
// Apply beta to Y_in here (or zero it when beta == 0), then sum in
// the product through the Export with ADD_ASSIGN.
4961 if (Y_is_overwritten) {
4962 Y_in.putScalar (ZERO);
4963 } else {
4964 Y_in.scale (beta);
4965 }
4966 Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4967 }
4968 // otherwise, multiply into Y
4969 else {
4970 // can't multiply in-situ; can't multiply into non-strided multivector
4971 //
4972 // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4973 // the user passed in the same MultiVector for both X and Y. It
4974 // won't detect whether one MultiVector views the other. We
4975 // should also check the MultiVectors' raw data pointers.
4976 if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4977 // Make a deep copy of Y_in, into which to write the multiply result.
4978 MV Y (Y_in, Teuchos::Copy);
4979 this->localApply (*X, Y, mode, alpha, beta);
4980 Tpetra::deep_copy (Y_in, Y);
4981 } else {
4982 this->localApply (*X, Y_in, mode, alpha, beta);
4983 }
4984 }
4985
4986 // If the range Map is a locally replicated map, sum the
4987 // contributions from each process. (That's why we set beta=0
4988 // above for all processes but Proc 0.)
4989 if (Y_is_replicated) {
4990 ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4991 Y_in.reduce ();
4992 }
4993 }
4994
// Local (on-process) sparse matrix-multivector multiply.  Obtains device
// views of X (read-only) and Y (read-write) and hands them, together with
// `mode`, `alpha` and `beta`, to the local multiply operator.
//
// NOTE(review): the embedded listing numbers jump 4996 -> 5000 and
// 5003 -> 5005, so the qualified function name, the leading (X, Y)
// parameters and one statement are absent from this extract.  The lines
// below are kept exactly as extracted; only one error message is fixed.
//
// In debug mode (Tpetra::Details::Behavior::debug()) the arguments are
// validated first, and std::runtime_error is thrown when
//   - X and Y have different numbers of vectors (columns),
//   - X or Y has the wrong local length for the requested mode
//     (NO_TRANS: X matches the column Map and Y the row Map;
//      any other mode: X matches the row Map and Y the column Map),
//   - the matrix is not fill complete,
//   - X or Y is not constant stride, or
//   - X and Y share the same nonnull device pointer with nonzero extent.
4995 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4996 void
5000 const Teuchos::ETransp mode,
5001 const Scalar& alpha,
5002 const Scalar& beta) const
5003 {
5005 using Teuchos::NO_TRANS;
5006 ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5007
5008 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5009 auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5010 auto matrix_lcl = getLocalMultiplyOperator();
5011
5012 const bool debug = ::Tpetra::Details::Behavior::debug ();
5013 if (debug) {
5014 const char tfecfFuncName[] = "localApply: ";
5015 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5016 (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5017 "X.getNumVectors() = " << X.getNumVectors () << " != "
5018 "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5019 const bool transpose = (mode != Teuchos::NO_TRANS);
5020 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5021 (! transpose && X.getLocalLength () !=
5022 getColMap ()->getLocalNumElements (), std::runtime_error,
5023 "NO_TRANS case: X has the wrong number of local rows. "
5024 "X.getLocalLength() = " << X.getLocalLength () << " != "
5025 "getColMap()->getLocalNumElements() = " <<
5026 getColMap ()->getLocalNumElements () << ".");
5027 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5028 (! transpose && Y.getLocalLength () !=
5029 getRowMap ()->getLocalNumElements (), std::runtime_error,
5030 "NO_TRANS case: Y has the wrong number of local rows. "
5031 "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5032 "getRowMap()->getLocalNumElements() = " <<
5033 getRowMap ()->getLocalNumElements () << ".");
5034 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5035 (transpose && X.getLocalLength () !=
5036 getRowMap ()->getLocalNumElements (), std::runtime_error,
5037 "TRANS or CONJ_TRANS case: X has the wrong number of local "
5038 "rows. X.getLocalLength() = " << X.getLocalLength ()
5039 << " != getRowMap()->getLocalNumElements() = "
5040 << getRowMap ()->getLocalNumElements () << ".");
// This check is about Y (the output), so the message must name Y; it
// previously said "X", contradicting the lengths it prints.
5041 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5042 (transpose && Y.getLocalLength () !=
5043 getColMap ()->getLocalNumElements (), std::runtime_error,
5044 "TRANS or CONJ_TRANS case: Y has the wrong number of local "
5045 "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5046 << " != getColMap()->getLocalNumElements() = "
5047 << getColMap ()->getLocalNumElements () << ".");
5048 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5049 (! isFillComplete (), std::runtime_error, "The matrix is not "
5050 "fill complete. You must call fillComplete() (possibly with "
5051 "domain and range Map arguments) without an intervening "
5052 "resumeFill() call before you may call this method.");
5053 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5054 (! X.isConstantStride () || ! Y.isConstantStride (),
5055 std::runtime_error, "X and Y must be constant stride.");
5056 // If the two pointers are null, then they don't alias one
5057 // another, even though they are equal.
5058 // Kokkos does not guarantee that zero row-extent vectors
5059 // point to different places, so we have to check that too.
5060 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5061 (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5062 && X_lcl.extent(0) != 0,
5063 std::runtime_error, "X and Y may not alias one another.");
5064 }
5065
// Kernel selection: "imbalance" is the longest local row minus the
// (integer) average local row length.  It stays 0 when there are no
// local rows, which also avoids dividing by zero.
5066 LocalOrdinal nrows = getLocalNumRows();
5067 LocalOrdinal maxRowImbalance = 0;
5068 if(nrows != 0)
5069 maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5070
// At or above Behavior::rowImbalanceThreshold() use applyImbalancedRows;
// otherwise use the regular apply.
5071 if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5072 matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5073 else
5074 matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5075 }
5076
// Public entry point for the sparse matrix-multivector product.
//
// NOTE(review): the embedded listing numbers jump 5078 -> 5082 and
// 5085 -> 5087, so the qualified function name, the (X, Y) parameters and
// one statement are absent from this extract.
//
// Behavior visible here:
//   - Throws std::runtime_error if the matrix is not fill complete.
//   - mode == Teuchos::NO_TRANS: forwards to applyNonTranspose(X, Y, alpha, beta).
//   - any other mode: if beta is exactly zero, Y is first filled with
//     zeros, then applyTranspose(X, Y, mode, alpha, beta) is called.
5077 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5078 void
5082 Teuchos::ETransp mode,
5083 Scalar alpha,
5084 Scalar beta) const
5085 {
5087 const char fnName[] = "Tpetra::CrsMatrix::apply";
5088
5089 TEUCHOS_TEST_FOR_EXCEPTION
5090 (! isFillComplete (), std::runtime_error,
5091 fnName << ": Cannot call apply() until fillComplete() "
5092 "has been called.");
5093
5094 if (mode == Teuchos::NO_TRANS) {
5095 ProfilingRegion regionNonTranspose (fnName);
5096 this->applyNonTranspose (X, Y, alpha, beta);
5097 }
5098 else {
5099 ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5100
5101 // Thyra implicitly assumed that Y is set to zero (i.e., overwritten)
5102 // when beta == 0.  That was not the case for the transpose in a
5103 // multithreaded environment, where a multiplication followed by
5104 // atomic adds is used and beta == 0 is effectively not special-cased.
5105 // Explicitly zeroing Y here also handles a Y that contains NaN or Inf.
5106 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5107 if (beta == ZERO) {
5108 Y.putScalar (ZERO);
5109 }
5110 this->applyTranspose (X, Y, mode, alpha, beta);
5111 }
5112 }
5113
5114
// Create a copy of this matrix whose values have type T instead of Scalar.
//
// NOTE(review): the embedded listing number jumps 5117 -> 5119, so the
// line with the class qualifier is absent from this extract.
//
// The new matrix is constructed from this matrix's graph
// (this->getCrsGraph()), its values are filled by an element-wise
// converting copy, and it is fill-completed with this matrix's domain and
// range Maps before being returned.
//
// Throws std::runtime_error if this matrix is not fill complete.
5115 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5116 template<class T>
5117 Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5119 convert () const
5120 {
5121 using Teuchos::RCP;
5122 typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5123 const char tfecfFuncName[] = "convert: ";
5124
5125 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5126 (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5127 "of the conversion) is not fill complete. You must first call "
5128 "fillComplete() (possibly with the domain and range Map) without an "
5129 "intervening call to resumeFill(), before you may call this method.");
5130
5131 RCP<output_matrix_type> newMatrix
5132 (new output_matrix_type (this->getCrsGraph ()));
5133 // Copy old values into new values. impl_scalar_type and T may
5134 // differ, so we can't use Kokkos::deep_copy.
5135 using ::Tpetra::Details::copyConvert;
// Destination (new matrix's values) first, source (this matrix's values) second.
5136 copyConvert (newMatrix->getLocalMatrixDevice ().values,
5137 this->getLocalMatrixDevice ().values);
5138 // Since newMatrix has a static (const) graph, the graph already has
5139 // a column Map, and Import and Export objects already exist (if
5140 // applicable). Thus, calling fillComplete is cheap.
5141 newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5142
5143 return newMatrix;
5144 }
5145
5146
// Debug-only consistency check of the matrix's internal state.
//
// NOTE(review): the embedded listing number jumps 5148 -> 5150, so the
// line with the class qualifier is absent from this extract.
//
// Does nothing unless Tpetra::Details::Behavior::debug("CrsGraph") is
// true.  Otherwise throws std::logic_error if any of these hold:
//   - staticGraph_ is null;
//   - myGraph_ is nonnull but differs from staticGraph_;
//   - the matrix is fill complete but its graph is not;
//   - the graph has allocated indices, a positive local allocation size
//     and at least one local row, yet valuesUnpacked_wdv has extent 0.
5147 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5148 void
5150 checkInternalState () const
5151 {
// NOTE(review): the debug switch is keyed on "CrsGraph", not "CrsMatrix";
// presumably intentional (shared switch) -- confirm.
5152 const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5153 if (debug) {
5154 const char tfecfFuncName[] = "checkInternalState: ";
5155 const char err[] = "Internal state is not consistent. "
5156 "Please report this bug to the Tpetra developers.";
5157
5158 // This version of the graph (RCP<const crs_graph_type>) must
5159 // always be nonnull.
5160 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5161 (staticGraph_.is_null (), std::logic_error, err);
5162 // myGraph == null means that the matrix has a const ("static")
5163 // graph. Otherwise, the matrix has a dynamic graph (it owns its
5164 // graph).
5165 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5166 (! myGraph_.is_null () && myGraph_ != staticGraph_,
5167 std::logic_error, err);
5168 // if matrix is fill complete, then graph must be fill complete
5169 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5170 (isFillComplete () && ! staticGraph_->isFillComplete (),
5171 std::logic_error, err << " Specifically, the matrix is fill complete, "
5172 "but its graph is NOT fill complete.");
5173 // If the graph's indices are allocated and nonzero in number (and
5174 // there is at least one local row), the values must be allocated too.
5175 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5176 (staticGraph_->indicesAreAllocated () &&
5177 staticGraph_->getLocalAllocationSize() > 0 &&
5178 staticGraph_->getLocalNumRows() > 0 &&
5179 valuesUnpacked_wdv.extent (0) == 0,
5180 std::logic_error, err);
5181 }
5182 }
5183
// Build a one-line, human-readable summary of the matrix.
//
// NOTE(review): the embedded listing number jumps 5185 -> 5187, so the
// line with the class qualifier is absent from this extract.
//
// The string always contains the class tag, the object label (only when
// it is nonempty), the fill-complete flag and the global dimensions.  The
// global number of entries is added only when the matrix is fill complete.
5184 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5185 std::string
5187 description () const
5188 {
5189 std::ostringstream os;
5190
5191 os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5192 if (this->getObjectLabel () != "") {
5193 os << "Label: \"" << this->getObjectLabel () << "\", ";
5194 }
5195 if (isFillComplete ()) {
5196 os << "isFillComplete: true"
5197 << ", global dimensions: [" << getGlobalNumRows () << ", "
5198 << getGlobalNumCols () << "]"
5199 << ", global number of entries: " << getGlobalNumEntries ()
5200 << "}";
5201 }
5202 else {
// Not fill complete: the global entry count is not printed.
5203 os << "isFillComplete: false"
5204 << ", global dimensions: [" << getGlobalNumRows () << ", "
5205 << getGlobalNumCols () << "]}";
5206 }
5207 return os.str ();
5208 }
5209
// Print a description of the matrix to `out`, with detail controlled by
// `verbLevel` (VERB_DEFAULT is treated as VERB_LOW).
//
// NOTE(review): the embedded listing number jumps 5211 -> 5213, so the
// line with the class qualifier is absent from this extract.
//
// What is printed, cumulatively:
//   VERB_NONE     nothing.
//   VERB_LOW      rank 0 prints the label, template parameters,
//                 fill-complete flag and global dimensions (plus global
//                 entry counts when fill complete).
//   VERB_MEDIUM   the row/column/domain/range Maps, then one section per
//                 process with its allocation and entry counts.
//   VERB_HIGH     one line per local row (rank, global row, entry count).
//   VERB_EXTREME  additionally the (global column index, value) pairs of
//                 every row.
//
// From VERB_MEDIUM upward every process executes comm->barrier() inside
// the per-rank loops, so presumably all processes in the communicator must
// call this method together -- confirm against the class documentation.
5210 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5211 void
5213 describe (Teuchos::FancyOStream &out,
5214 const Teuchos::EVerbosityLevel verbLevel) const
5215 {
5216 using std::endl;
5217 using std::setw;
5218 using Teuchos::ArrayView;
5219 using Teuchos::Comm;
5220 using Teuchos::RCP;
5221 using Teuchos::TypeNameTraits;
5222 using Teuchos::VERB_DEFAULT;
5223 using Teuchos::VERB_NONE;
5224 using Teuchos::VERB_LOW;
5225 using Teuchos::VERB_MEDIUM;
5226 using Teuchos::VERB_HIGH;
5227 using Teuchos::VERB_EXTREME;
5228
5229 const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5230
5231 if (vl == VERB_NONE) {
5232 return; // Don't print anything at all
5233 }
5234
5235 // By convention, describe() always begins with a tab.
5236 Teuchos::OSTab tab0 (out);
5237
5238 RCP<const Comm<int> > comm = this->getComm();
5239 const int myRank = comm->getRank();
5240 const int numProcs = comm->getSize();
// Column width for the per-row table: the number of decimal digits of the
// global row count, but at least 11, plus 2.
5241 size_t width = 1;
5242 for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5243 ++width;
5244 }
5245 width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5246
5247 // none: print nothing
5248 // low: print O(1) info from node 0
5249 // medium: print O(P) info, num entries per process
5250 // high: print O(N) info, num entries per row
5251 // extreme: print O(NNZ) info: print indices and values
5252 //
5253 // for medium and higher, print constituent objects at specified verbLevel
5254 if (myRank == 0) {
5255 out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5256 }
5257 Teuchos::OSTab tab1 (out);
5258
5259 if (myRank == 0) {
5260 if (this->getObjectLabel () != "") {
5261 out << "Label: \"" << this->getObjectLabel () << "\", ";
5262 }
5263 {
5264 out << "Template parameters:" << endl;
5265 Teuchos::OSTab tab2 (out);
5266 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5267 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5268 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5269 << "Node: " << TypeNameTraits<Node>::name () << endl;
5270 }
5271 if (isFillComplete()) {
5272 out << "isFillComplete: true" << endl
5273 << "Global dimensions: [" << getGlobalNumRows () << ", "
5274 << getGlobalNumCols () << "]" << endl
5275 << "Global number of entries: " << getGlobalNumEntries () << endl
5276 << endl << "Global max number of entries in a row: "
5277 << getGlobalMaxNumRowEntries () << endl;
5278 }
5279 else {
5280 out << "isFillComplete: false" << endl
5281 << "Global dimensions: [" << getGlobalNumRows () << ", "
5282 << getGlobalNumCols () << "]" << endl;
5283 }
5284 }
5285
5286 if (vl < VERB_MEDIUM) {
5287 return; // all done!
5288 }
5289
5290 // Describe the row Map.
5291 if (myRank == 0) {
5292 out << endl << "Row Map:" << endl;
5293 }
5294 if (getRowMap ().is_null ()) {
5295 if (myRank == 0) {
5296 out << "null" << endl;
5297 }
5298 }
5299 else {
5300 if (myRank == 0) {
5301 out << endl;
5302 }
5303 getRowMap ()->describe (out, vl);
5304 }
5305
5306 // Describe the column Map.  A Map whose RCP equals an earlier one is
5306 // only named ("same as ..."), not described again.
5307 if (myRank == 0) {
5308 out << "Column Map: ";
5309 }
5310 if (getColMap ().is_null ()) {
5311 if (myRank == 0) {
5312 out << "null" << endl;
5313 }
5314 } else if (getColMap () == getRowMap ()) {
5315 if (myRank == 0) {
5316 out << "same as row Map" << endl;
5317 }
5318 } else {
5319 if (myRank == 0) {
5320 out << endl;
5321 }
5322 getColMap ()->describe (out, vl);
5323 }
5324
5325 // Describe the domain Map.
5326 if (myRank == 0) {
5327 out << "Domain Map: ";
5328 }
5329 if (getDomainMap ().is_null ()) {
5330 if (myRank == 0) {
5331 out << "null" << endl;
5332 }
5333 } else if (getDomainMap () == getRowMap ()) {
5334 if (myRank == 0) {
5335 out << "same as row Map" << endl;
5336 }
5337 } else if (getDomainMap () == getColMap ()) {
5338 if (myRank == 0) {
5339 out << "same as column Map" << endl;
5340 }
5341 } else {
5342 if (myRank == 0) {
5343 out << endl;
5344 }
5345 getDomainMap ()->describe (out, vl);
5346 }
5347
5348 // Describe the range Map.
5349 if (myRank == 0) {
5350 out << "Range Map: ";
5351 }
5352 if (getRangeMap ().is_null ()) {
5353 if (myRank == 0) {
5354 out << "null" << endl;
5355 }
5356 } else if (getRangeMap () == getDomainMap ()) {
5357 if (myRank == 0) {
5358 out << "same as domain Map" << endl;
5359 }
5360 } else if (getRangeMap () == getRowMap ()) {
5361 if (myRank == 0) {
5362 out << "same as row Map" << endl;
5363 }
5364 } else {
5365 if (myRank == 0) {
5366 out << endl;
5367 }
5368 getRangeMap ()->describe (out, vl);
5369 }
5370
5371 // O(P) data: the ranks take turns, in rank order, printing their
5371 // local allocation and entry counts.
5372 for (int curRank = 0; curRank < numProcs; ++curRank) {
5373 if (myRank == curRank) {
5374 out << "Process rank: " << curRank << endl;
5375 Teuchos::OSTab tab2 (out);
5376 if (! staticGraph_->indicesAreAllocated ()) {
5377 out << "Graph indices not allocated" << endl;
5378 }
5379 else {
5380 out << "Number of allocated entries: "
5381 << staticGraph_->getLocalAllocationSize () << endl;
5382 }
5383 out << "Number of entries: " << getLocalNumEntries () << endl
5384 << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5385 << endl;
5386 }
5387 // Give output time to complete by executing some barriers.
5388 comm->barrier ();
5389 comm->barrier ();
5390 comm->barrier ();
5391 }
5392
5393 if (vl < VERB_HIGH) {
5394 return; // all done!
5395 }
5396
5397 // O(N) and O(NNZ) data: again one rank at a time, one table row per
5397 // local matrix row.
5398 for (int curRank = 0; curRank < numProcs; ++curRank) {
5399 if (myRank == curRank) {
5400 out << std::setw(width) << "Proc Rank"
5401 << std::setw(width) << "Global Row"
5402 << std::setw(width) << "Num Entries";
5403 if (vl == VERB_EXTREME) {
5404 out << std::setw(width) << "(Index,Value)";
5405 }
5406 out << endl;
5407 for (size_t r = 0; r < getLocalNumRows (); ++r) {
5408 const size_t nE = getNumEntriesInLocalRow(r);
5409 GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5410 out << std::setw(width) << myRank
5411 << std::setw(width) << gid
5412 << std::setw(width) << nE;
5413 if (vl == VERB_EXTREME) {
// Column indices are always printed as global indices: directly when
// the matrix is globally indexed, via the column Map otherwise.  If the
// matrix is neither, no entries are printed for the row.
5414 if (isGloballyIndexed()) {
5415 global_inds_host_view_type rowinds;
5416 values_host_view_type rowvals;
5417 getGlobalRowView (gid, rowinds, rowvals);
5418 for (size_t j = 0; j < nE; ++j) {
5419 out << " (" << rowinds[j]
5420 << ", " << rowvals[j]
5421 << ") ";
5422 }
5423 }
5424 else if (isLocallyIndexed()) {
5425 local_inds_host_view_type rowinds;
5426 values_host_view_type rowvals;
5427 getLocalRowView (r, rowinds, rowvals);
5428 for (size_t j=0; j < nE; ++j) {
5429 out << " (" << getColMap()->getGlobalElement(rowinds[j])
5430 << ", " << rowvals[j]
5431 << ") ";
5432 }
5433 } // globally or locally indexed
5434 } // vl == VERB_EXTREME
5435 out << endl;
5436 } // for each row r on this process
5437 } // if (myRank == curRank)
5438
5439 // Give output time to complete
5440 comm->barrier ();
5441 comm->barrier ();
5442 comm->barrier ();
5443 } // for each process p
5444 }
5445
// Report whether `source` is an acceptable source object for this matrix.
//
// NOTE(review): the embedded listing number jumps 5447 -> 5449, so the
// line with the class qualifier is absent from this extract.
//
// Returns true exactly when `source` can be dynamic_cast to
// row_matrix_type; no sizes are actually compared.
5446 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5447 bool
5449 checkSizes (const SrcDistObject& source)
5450 {
5451 // It's not clear what kind of compatibility checks on sizes can
5452 // be performed here. Epetra_CrsGraph doesn't check any sizes for
5453 // compatibility.
5454
5455 // Currently, the source object must be a RowMatrix with the same
5456 // four template parameters as the target CrsMatrix. We might
5457 // relax this requirement later.
5458 const row_matrix_type* srcRowMat =
5459 dynamic_cast<const row_matrix_type*> (&source);
5460 return (srcRowMat != nullptr);
5461 }
5462
// Grow this matrix's unpacked storage (column indices and values) according
// to `padding`, then install the resulting row offsets in the graph.
//
// NOTE(review): the embedded listing numbers jump 5464 -> 5467 and
// 5469 -> 5471, so the qualified function name and one statement are
// absent from this extract.
//
// Steps visible below:
//   1. If the graph's indices are not yet allocated, allocate values.
//   2. Copy the graph's unpacked row offsets into a mutable device View
//      (row_ptr_beg) and build the matching per-row end offsets
//      (row_ptr_end).
//   3. Call padCrsArrays on the global or local index array (whichever the
//      graph uses) together with the values array.
//   4. If the graph tracks per-row entry counts, recompute them from the
//      padded offsets and copy them back.
//   5. Hand row_ptr_beg to the graph via setRowPtrsUnpacked.
//
// `verbose` enables progress messages on std::cerr.  Throws
// std::logic_error if, after padding, the values and column-index arrays
// have different lengths.
5463 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5464 void
5467 const typename crs_graph_type::padding_type& padding,
5468 const bool verbose)
5469 {
5471 using Details::padCrsArrays;
5472 using std::endl;
5473 using LO = local_ordinal_type;
5474 using row_ptrs_type =
5475 typename local_graph_device_type::row_map_type::non_const_type;
5476 using range_policy =
5477 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5478 const char tfecfFuncName[] = "applyCrsPadding";
5479 const char suffix[] =
5480 ". Please report this bug to the Tpetra developers.";
5481 ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5482
5483 std::unique_ptr<std::string> prefix;
5484 if (verbose) {
5485 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5486 std::ostringstream os;
5487 os << *prefix << "padding: ";
5488 padding.print(os);
5489 os << endl;
5490 std::cerr << os.str();
5491 }
// The rank is only looked up in verbose mode; -1 stands for "unknown"
// (not verbose, null Map, or null communicator).
5492 const int myRank = ! verbose ? -1 : [&] () {
5493 auto map = this->getMap();
5494 if (map.is_null()) {
5495 return -1;
5496 }
5497 auto comm = map->getComm();
5498 if (comm.is_null()) {
5499 return -1;
5500 }
5501 return comm->getRank();
5502 } ();
5503
5504 // NOTE (mfh 29 Jan 2020) This allocates the values array.
// NOTE(review): the verbose message below says "allocateIndices", but the
// function actually called is allocateValues -- message looks stale.
5505 if (! myGraph_->indicesAreAllocated()) {
5506 if (verbose) {
5507 std::ostringstream os;
5508 os << *prefix << "Call allocateIndices" << endl;
5509 std::cerr << os.str();
5510 }
5511 allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5512 }
5513
5514 // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5515 // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5516 // size needs to increase. That should be the job of
5517 // padCrsArrays.
5518
5519 // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5520 // would use it directly.
5521
5522 if (verbose) {
5523 std::ostringstream os;
5524 os << *prefix << "Allocate row_ptrs_beg: "
5525 << myGraph_->rowPtrsUnpacked_host_.extent(0) << endl;
5526 std::cerr << os.str();
5527 }
5528 using Kokkos::view_alloc;
5529 using Kokkos::WithoutInitializing;
// Uninitialized allocation is fine here: the deep_copy right below
// overwrites every element.
5530 row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5531 myGraph_->rowPtrsUnpacked_dev_.extent(0));
5532 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5533 Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5534
// N = number of rows = (number of row offsets) - 1, or 0 when there are
// no offsets at all.
5535 const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5536 size_t(row_ptr_beg.extent(0) - 1);
5537 if (verbose) {
5538 std::ostringstream os;
5539 os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5540 std::cerr << os.str();
5541 }
5542 row_ptrs_type row_ptr_end(
5543 view_alloc("row_ptr_end", WithoutInitializing), N);
5544
5545 row_ptrs_type num_row_entries_d;
5546
// A nonempty k_numRowEntries_ means the graph keeps explicit per-row
// entry counts, which must be used now and refreshed after padding.
5547 const bool refill_num_row_entries =
5548 myGraph_->k_numRowEntries_.extent(0) != 0;
5549
5550 if (refill_num_row_entries) { // unpacked storage
5551 // We can't assume correct *this capture until C++17, and it's
5552 // likely more efficient just to capture what we need anyway.
5553 num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5554 myGraph_->k_numRowEntries_);
// end(i) = begin(i) + (number of entries currently in row i)
5555 Kokkos::parallel_for
5556 ("Fill end row pointers", range_policy(0, N),
5557 KOKKOS_LAMBDA (const size_t i) {
5558 row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5559 });
5560 }
5561 else {
5562 // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5563 // storage, we don't need row_ptr_end to be separate allocation;
5564 // could just have it alias row_ptr_beg+1.
// No per-row counts: each row ends where the next one begins.
5565 Kokkos::parallel_for
5566 ("Fill end row pointers", range_policy(0, N),
5567 KOKKOS_LAMBDA (const size_t i) {
5568 row_ptr_end(i) = row_ptr_beg(i+1);
5569 });
5570 }
5571
// Pad whichever index array the graph currently uses, together with the
// values; afterwards both arrays must have the same length.
5572 if (myGraph_->isGloballyIndexed()) {
5573 padCrsArrays(row_ptr_beg, row_ptr_end,
5574 myGraph_->gblInds_wdv,
5575 valuesUnpacked_wdv, padding, myRank, verbose);
5576 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5577 const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5578 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5579 (newValuesLen != newColIndsLen, std::logic_error,
5580 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5581 << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5582 << suffix);
5583 }
5584 else {
5585 padCrsArrays(row_ptr_beg, row_ptr_end,
5586 myGraph_->lclIndsUnpacked_wdv,
5587 valuesUnpacked_wdv, padding, myRank, verbose);
5588 const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5589 const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5590 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5591 (newValuesLen != newColIndsLen, std::logic_error,
5592 ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5593 << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5594 << suffix);
5595 }
5596
5597 if (refill_num_row_entries) {
// Recompute the per-row counts from the (possibly changed) offsets and
// copy them back into the graph.
5598 Kokkos::parallel_for
5599 ("Fill num entries", range_policy(0, N),
5600 KOKKOS_LAMBDA (const size_t i) {
5601 num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5602 });
5603 Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5604 }
5605
// NOTE(review): the size assertion below runs only in verbose mode.
5606 if (verbose) {
5607 std::ostringstream os;
5608 os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5609 << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5610 << ", new size: " << row_ptr_beg.extent(0) << endl;
5611 std::cerr << os.str();
5612 TEUCHOS_ASSERT( myGraph_->rowPtrsUnpacked_host_.extent(0) ==
5613 row_ptr_beg.extent(0) );
5614 }
5615 myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5616 }
5617
5618 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5619 void
5620 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5621 copyAndPermuteStaticGraph(
5622 const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5623 const size_t numSameIDs,
5624 const LocalOrdinal permuteToLIDs[],
5625 const LocalOrdinal permuteFromLIDs[],
5626 const size_t numPermutes)
5627 {
5628 using Details::ProfilingRegion;
5629 using Teuchos::Array;
5630 using Teuchos::ArrayView;
5631 using std::endl;
5632 using LO = LocalOrdinal;
5633 using GO = GlobalOrdinal;
5634 const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5635 const char suffix[] =
5636 " Please report this bug to the Tpetra developers.";
5637 ProfilingRegion regionCAP
5638 ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5639
5640 const bool debug = Details::Behavior::debug("CrsGraph");
5641 const bool verbose = Details::Behavior::verbose("CrsGraph");
5642 std::unique_ptr<std::string> prefix;
5643 if (verbose) {
5644 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5645 std::ostringstream os;
5646 os << *prefix << "Start" << endl;
5647 }
5648 const char* const prefix_raw =
5649 verbose ? prefix.get()->c_str() : nullptr;
5650
5651 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5652 //
5653 // Copy the first numSame row from source to target (this matrix).
5654 // This involves copying rows corresponding to LIDs [0, numSame-1].
5655 //
5656 const map_type& srcRowMap = * (srcMat.getRowMap ());
5657 nonconst_global_inds_host_view_type rowInds;
5658 nonconst_values_host_view_type rowVals;
5659 const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5660 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5661 // Global ID for the current row index in the source matrix.
5662 // The first numSameIDs GIDs in the two input lists are the
5663 // same, so sourceGID == targetGID in this case.
5664 const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5665 const GO targetGID = sourceGID;
5666
5667 ArrayView<const GO>rowIndsConstView;
5668 ArrayView<const Scalar> rowValsConstView;
5669
5670 if (sourceIsLocallyIndexed) {
5671 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5672 if (rowLength > static_cast<size_t> (rowInds.size())) {
5673 Kokkos::resize(rowInds,rowLength);
5674 Kokkos::resize(rowVals,rowLength);
5675 }
5676 // Resizing invalidates an Array's views, so we must make new
5677 // ones, even if rowLength hasn't changed.
5678 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5679 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5680
5681 // The source matrix is locally indexed, so we have to get a
5682 // copy. Really it's the GIDs that have to be copied (because
5683 // they have to be converted from LIDs).
5684 size_t checkRowLength = 0;
5685 srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5686 rowValsView, checkRowLength);
5687 if (debug) {
5688 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5689 (rowLength != checkRowLength, std::logic_error, "For "
5690 "global row index " << sourceGID << ", the source "
5691 "matrix's getNumEntriesInGlobalRow returns a row length "
5692 "of " << rowLength << ", but getGlobalRowCopy reports "
5693 "a row length of " << checkRowLength << "." << suffix);
5694 }
5695
5696 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5697 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5698 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5699 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5700 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5701 rowIndsView.data(), rowIndsView.extent(0),
5702 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5703 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5704 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5705 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5706 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5707 // KDDKDD UVM TEMPORARY: KokkosView interface
5708 }
5709 else { // source matrix is globally indexed.
5710 global_inds_host_view_type rowIndsView;
5711 values_host_view_type rowValsView;
5712 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5713 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5714 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5715 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5716 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5717 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5718 rowIndsView.data(), rowIndsView.extent(0),
5719 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5720 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5721 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5722 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5723 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5724 // KDDKDD UVM TEMPORARY: KokkosView interface
5725
5726 }
5727
5728 // Applying a permutation to a matrix with a static graph
5729 // means REPLACE-ing entries.
5730 combineGlobalValues(targetGID, rowIndsConstView,
5731 rowValsConstView, REPLACE,
5732 prefix_raw, debug, verbose);
5733 }
5734
5735 if (verbose) {
5736 std::ostringstream os;
5737 os << *prefix << "Do permutes" << endl;
5738 }
5739
5740 const map_type& tgtRowMap = * (this->getRowMap ());
5741 for (size_t p = 0; p < numPermutes; ++p) {
5742 const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5743 const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5744
5745 ArrayView<const GO> rowIndsConstView;
5746 ArrayView<const Scalar> rowValsConstView;
5747
5748 if (sourceIsLocallyIndexed) {
5749 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5750 if (rowLength > static_cast<size_t> (rowInds.size ())) {
5751 Kokkos::resize(rowInds,rowLength);
5752 Kokkos::resize(rowVals,rowLength);
5753 }
5754 // Resizing invalidates an Array's views, so we must make new
5755 // ones, even if rowLength hasn't changed.
5756 nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5757 nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5758
5759 // The source matrix is locally indexed, so we have to get a
5760 // copy. Really it's the GIDs that have to be copied (because
5761 // they have to be converted from LIDs).
5762 size_t checkRowLength = 0;
5763 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5764 rowValsView, checkRowLength);
5765 if (debug) {
5766 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5767 (rowLength != checkRowLength, std::logic_error, "For "
5768 "source matrix global row index " << sourceGID << ", "
5769 "getNumEntriesInGlobalRow returns a row length of " <<
5770 rowLength << ", but getGlobalRowCopy a row length of "
5771 << checkRowLength << "." << suffix);
5772 }
5773
5774 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5775 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5776 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5777 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5778 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5779 rowIndsView.data(), rowIndsView.extent(0),
5780 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5781 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5782 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5783 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5784 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5785 // KDDKDD UVM TEMPORARY: KokkosView interface
5786 }
5787 else {
5788 global_inds_host_view_type rowIndsView;
5789 values_host_view_type rowValsView;
5790 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5791 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5792 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5793 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5794 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5795 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5796 rowIndsView.data(), rowIndsView.extent(0),
5797 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5798 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5799 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5800 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5801 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5802 // KDDKDD UVM TEMPORARY: KokkosView interface
5803 }
5804
5805 combineGlobalValues(targetGID, rowIndsConstView,
5806 rowValsConstView, REPLACE,
5807 prefix_raw, debug, verbose);
5808 }
5809
5810 if (verbose) {
5811 std::ostringstream os;
5812 os << *prefix << "Done" << endl;
5813 }
5814 }
5815
5816 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5817 void
5818 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5819 copyAndPermuteNonStaticGraph(
5820 const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5821 const size_t numSameIDs,
5822 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5823 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5824 const size_t numPermutes)
5825 {
5826 using Details::ProfilingRegion;
5827 using Teuchos::Array;
5828 using Teuchos::ArrayView;
5829 using std::endl;
5830 using LO = LocalOrdinal;
5831 using GO = GlobalOrdinal;
5832 const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5833 const char suffix[] =
5834 " Please report this bug to the Tpetra developers.";
5835 ProfilingRegion regionCAP
5836 ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5837
5838 const bool debug = Details::Behavior::debug("CrsGraph");
5839 const bool verbose = Details::Behavior::verbose("CrsGraph");
5840 std::unique_ptr<std::string> prefix;
5841 if (verbose) {
5842 prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5843 std::ostringstream os;
5844 os << *prefix << "Start" << endl;
5845 }
5846 const char* const prefix_raw =
5847 verbose ? prefix.get()->c_str() : nullptr;
5848
5849 {
5850 using row_graph_type = RowGraph<LO, GO, Node>;
5851 const row_graph_type& srcGraph = *(srcMat.getGraph());
5852 auto padding =
5853 myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5854 permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5855 applyCrsPadding(*padding, verbose);
5856 }
5857 const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5858 //
5859 // Copy the first numSame row from source to target (this matrix).
5860 // This involves copying rows corresponding to LIDs [0, numSame-1].
5861 //
5862 const map_type& srcRowMap = * (srcMat.getRowMap ());
5863 const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5864 using gids_type = nonconst_global_inds_host_view_type;
5865 using vals_type = nonconst_values_host_view_type;
5866 gids_type rowInds;
5867 vals_type rowVals;
5868 for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5869 // Global ID for the current row index in the source matrix.
5870 // The first numSameIDs GIDs in the two input lists are the
5871 // same, so sourceGID == targetGID in this case.
5872 const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5873 const GO targetGID = sourceGID;
5874
5875 ArrayView<const GO> rowIndsConstView;
5876 ArrayView<const Scalar> rowValsConstView;
5877
5878 if (sourceIsLocallyIndexed) {
5879
5880 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5881 if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5882 Kokkos::resize(rowInds,rowLength);
5883 Kokkos::resize(rowVals,rowLength);
5884 }
5885 // Resizing invalidates an Array's views, so we must make new
5886 // ones, even if rowLength hasn't changed.
5887 gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5888 vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5889
5890 // The source matrix is locally indexed, so we have to get a
5891 // copy. Really it's the GIDs that have to be copied (because
5892 // they have to be converted from LIDs).
5893 size_t checkRowLength = 0;
5894 srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5895 checkRowLength);
5896 if (debug) {
5897 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5898 (rowLength != checkRowLength, std::logic_error, ": For "
5899 "global row index " << sourceGID << ", the source "
5900 "matrix's getNumEntriesInGlobalRow returns a row length "
5901 "of " << rowLength << ", but getGlobalRowCopy reports "
5902 "a row length of " << checkRowLength << "." << suffix);
5903 }
5904 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5905 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5906 }
5907 else { // source matrix is globally indexed.
5908 global_inds_host_view_type rowIndsView;
5909 values_host_view_type rowValsView;
5910 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5911
5912 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5913 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5914 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5915 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5916 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5917 rowIndsView.data(), rowIndsView.extent(0),
5918 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5919 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5920 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5921 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5922 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5923 // KDDKDD UVM TEMPORARY: KokkosView interface
5924 }
5925
5926 // Combine the data into the target matrix.
5927 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5928 rowValsConstView, prefix_raw, debug, verbose);
5929 }
5930
5931 if (verbose) {
5932 std::ostringstream os;
5933 os << *prefix << "Do permutes" << endl;
5934 }
5935 const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5936 const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5937
5938 const map_type& tgtRowMap = * (this->getRowMap ());
5939 for (size_t p = 0; p < numPermutes; ++p) {
5940 const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5941 const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5942
5943 ArrayView<const GO> rowIndsConstView;
5944 ArrayView<const Scalar> rowValsConstView;
5945
5946 if (sourceIsLocallyIndexed) {
5947 const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5948 if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5949 Kokkos::resize(rowInds,rowLength);
5950 Kokkos::resize(rowVals,rowLength);
5951 }
5952 // Resizing invalidates an Array's views, so we must make new
5953 // ones, even if rowLength hasn't changed.
5954 gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5955 vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5956
5957 // The source matrix is locally indexed, so we have to get a
5958 // copy. Really it's the GIDs that have to be copied (because
5959 // they have to be converted from LIDs).
5960 size_t checkRowLength = 0;
5961 srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5962 rowValsView, checkRowLength);
5963 if (debug) {
5964 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5965 (rowLength != checkRowLength, std::logic_error, "For "
5966 "source matrix global row index " << sourceGID << ", "
5967 "getNumEntriesInGlobalRow returns a row length of " <<
5968 rowLength << ", but getGlobalRowCopy a row length of "
5969 << checkRowLength << "." << suffix);
5970 }
5971 rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5972 rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5973 }
5974 else {
5975 global_inds_host_view_type rowIndsView;
5976 values_host_view_type rowValsView;
5977 srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5978
5979 // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5980 // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5981 // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5982 // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5983 rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5984 rowIndsView.data(), rowIndsView.extent(0),
5985 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5986 rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5987 reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5988 Teuchos::RCP_DISABLE_NODE_LOOKUP);
5989 // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5990 // KDDKDD UVM TEMPORARY: KokkosView interface
5991 }
5992
5993 // Combine the data into the target matrix.
5994 insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5995 rowValsConstView, prefix_raw, debug, verbose);
5996 }
5997
5998 if (verbose) {
5999 std::ostringstream os;
6000 os << *prefix << "Done" << endl;
6001 }
6002 }
6003
6004 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6005 void
6006 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6007 copyAndPermute(
6008 const SrcDistObject& srcObj,
6009 const size_t numSameIDs,
6010 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6011 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6012 const CombineMode /*CM*/)
6013 {
6014 using Details::Behavior;
6015 using Details::dualViewStatusToString;
6016 using Details::ProfilingRegion;
6017 using std::endl;
6018
6019 // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6020 const char tfecfFuncName[] = "copyAndPermute: ";
6021 ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6022
6023 const bool verbose = Behavior::verbose("CrsMatrix");
6024 std::unique_ptr<std::string> prefix;
6025 if (verbose) {
6026 prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6027 std::ostringstream os;
6028 os << *prefix << endl
6029 << *prefix << " numSameIDs: " << numSameIDs << endl
6030 << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6031 << endl
6032 << *prefix << " "
6033 << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6034 << endl
6035 << *prefix << " "
6036 << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6037 << endl
6038 << *prefix << " "
6039 << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6040 << endl;
6041 std::cerr << os.str ();
6042 }
6043
6044 const auto numPermute = permuteToLIDs.extent (0);
6045 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6046 (numPermute != permuteFromLIDs.extent (0),
6047 std::invalid_argument, "permuteToLIDs.extent(0) = "
6048 << numPermute << "!= permuteFromLIDs.extent(0) = "
6049 << permuteFromLIDs.extent (0) << ".");
6050
6051 // This dynamic cast should succeed, because we've already tested
6052 // it in checkSizes().
6053 using RMT = RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
6054 const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6055 if (isStaticGraph ()) {
6056 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6057 auto permuteToLIDs_h = permuteToLIDs.view_host ();
6058 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6059 auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6060
6061 copyAndPermuteStaticGraph(srcMat, numSameIDs,
6062 permuteToLIDs_h.data(),
6063 permuteFromLIDs_h.data(),
6064 numPermute);
6065 }
6066 else {
6067 copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6068 permuteFromLIDs, numPermute);
6069 }
6070
6071 if (verbose) {
6072 std::ostringstream os;
6073 os << *prefix << "Done" << endl;
6074 std::cerr << os.str();
6075 }
6076 }
6077
6078 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6079 void
6080 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6081 packAndPrepare
6082 (const SrcDistObject& source,
6083 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6084 Kokkos::DualView<char*, buffer_device_type>& exports,
6085 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6086 size_t& constantNumPackets)
6087 {
6088 using Details::Behavior;
6089 using Details::dualViewStatusToString;
6090 using Details::ProfilingRegion;
6091 using Teuchos::outArg;
6092 using Teuchos::REDUCE_MAX;
6093 using Teuchos::reduceAll;
6094 using std::endl;
6095 typedef LocalOrdinal LO;
6096 typedef GlobalOrdinal GO;
6097 const char tfecfFuncName[] = "packAndPrepare: ";
6098 ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6099
6100 const bool debug = Behavior::debug("CrsMatrix");
6101 const bool verbose = Behavior::verbose("CrsMatrix");
6102
6103 // Processes on which the communicator is null should not participate.
6104 Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6105 if (pComm.is_null ()) {
6106 return;
6107 }
6108 const Teuchos::Comm<int>& comm = *pComm;
6109 const int myRank = comm.getSize ();
6110
6111 std::unique_ptr<std::string> prefix;
6112 if (verbose) {
6113 prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6114 std::ostringstream os;
6115 os << *prefix << "Start" << endl
6116 << *prefix << " "
6117 << dualViewStatusToString (exportLIDs, "exportLIDs")
6118 << endl
6119 << *prefix << " "
6120 << dualViewStatusToString (exports, "exports")
6121 << endl
6122 << *prefix << " "
6123 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6124 << endl;
6125 std::cerr << os.str ();
6126 }
6127
6128 // Attempt to cast the source object to CrsMatrix. If successful,
6129 // use the source object's packNew() method to pack its data for
6130 // communication. Otherwise, attempt to cast to RowMatrix; if
6131 // successful, use the source object's pack() method. Otherwise,
6132 // the source object doesn't have the right type.
6133 //
6134 // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6135 // RowMatrix to have the same Node type. Unfortunately, we don't
6136 // have a way to ask if the RowMatrix is "a RowMatrix with any
6137 // Node type," since RowMatrix doesn't have a base class. A
6138 // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6139 // not currently exist, would satisfy this requirement.
6140 //
6141 // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6142 // type doesn't technically need to match the target object's
6143 // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6144 // and GO need not be the same, as long as there is no overflow of
6145 // the indices. However, checking for index overflow is global
6146 // and therefore undesirable.
6147
6148 std::ostringstream msg; // for collecting error messages
6149 int lclBad = 0; // to be set below
6150
6151 using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6152 const crs_matrix_type* srcCrsMat =
6153 dynamic_cast<const crs_matrix_type*> (&source);
6154 if (srcCrsMat != nullptr) {
6155 if (verbose) {
6156 std::ostringstream os;
6157 os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6158 "calling packNew" << endl;
6159 std::cerr << os.str ();
6160 }
6161 try {
6162 srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6163 constantNumPackets);
6164 }
6165 catch (std::exception& e) {
6166 lclBad = 1;
6167 msg << "Proc " << myRank << ": " << e.what () << std::endl;
6168 }
6169 }
6170 else {
6171 using Kokkos::HostSpace;
6172 using Kokkos::subview;
6173 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6174 using range_type = Kokkos::pair<size_t, size_t>;
6175
6176 if (verbose) {
6177 std::ostringstream os;
6178 os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6179 << endl;
6180 std::cerr << os.str ();
6181 }
6182
6183 const row_matrix_type* srcRowMat =
6184 dynamic_cast<const row_matrix_type*> (&source);
6185 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6186 (srcRowMat == nullptr, std::invalid_argument,
6187 "The source object of the Import or Export operation is neither a "
6188 "CrsMatrix (with the same template parameters as the target object), "
6189 "nor a RowMatrix (with the same first four template parameters as the "
6190 "target object).");
6191
6192 // For the RowMatrix case, we need to convert from
6193 // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6194 // so terribly efficient, since packing a non-CrsMatrix
6195 // RowMatrix for Import/Export into a CrsMatrix is not a
6196 // critical case. Thus, we may allocate Teuchos::Array objects
6197 // here and copy to and from Kokkos::*View.
6198
6199 // View exportLIDs's host data as a Teuchos::ArrayView.
6200 TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6201 auto exportLIDs_h = exportLIDs.view_host ();
6202 Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6203 exportLIDs_h.size ());
6204
6205 // pack() will allocate exports_a as needed. We'll copy back
6206 // into exports (after (re)allocating exports if needed) below.
6207 Teuchos::Array<char> exports_a;
6208
6209 // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6210 // need to sync, since we're doing write-only access, but we do
6211 // need to mark the DualView as modified on host.
6212
6213 numPacketsPerLID.clear_sync_state (); // write-only access
6214 numPacketsPerLID.modify_host ();
6215 auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6216 Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6217 numPacketsPerLID_h.size ());
6218
6219 // Invoke RowMatrix's legacy pack() interface, using above
6220 // Teuchos::Array* objects.
6221 try {
6222 srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6223 constantNumPackets);
6224 }
6225 catch (std::exception& e) {
6226 lclBad = 1;
6227 msg << "Proc " << myRank << ": " << e.what () << std::endl;
6228 }
6229
6230 // Allocate 'exports', and copy exports_a back into it.
6231 const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6232 if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6233 const std::string oldLabel = exports.d_view.label ();
6234 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6235 exports = exports_type (newLabel, newAllocSize);
6236 }
6237 // It's safe to assume that we're working on host anyway, so
6238 // just keep exports sync'd to host.
6239 // ignore current device contents
6240 exports.modify_host();
6241
6242 auto exports_h = exports.view_host ();
6243 auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6244
6245 // Kokkos::deep_copy needs a Kokkos::View input, so turn
6246 // exports_a into a nonowning Kokkos::View first before copying.
6247 typedef typename exports_type::t_host::execution_space HES;
6248 typedef Kokkos::Device<HES, HostSpace> host_device_type;
6249 Kokkos::View<const char*, host_device_type>
6250 exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6251 // DEEP_COPY REVIEW - NOT TESTED
6252 Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6253 }
6254
6255 if (debug) {
6256 int gblBad = 0; // output argument; to be set below
6257 reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6258 if (gblBad != 0) {
6259 Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6260 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6261 (true, std::logic_error, "packNew() or pack() threw an exception on "
6262 "one or more participating processes.");
6263 }
6264 }
6265 else {
6266 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6267 (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6268 "or more participating processes. Here is this process' error "
6269 "message: " << msg.str ());
6270 }
6271
6272 if (verbose) {
6273 std::ostringstream os;
6274 os << *prefix << "packAndPrepare: Done!" << endl
6275 << *prefix << " "
6276 << dualViewStatusToString (exportLIDs, "exportLIDs")
6277 << endl
6278 << *prefix << " "
6279 << dualViewStatusToString (exports, "exports")
6280 << endl
6281 << *prefix << " "
6282 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6283 << endl;
6284 std::cerr << os.str ();
6285 }
6286 }
6287
6288 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6289 size_t
6290 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6291 packRow (char exports[],
6292 const size_t offset,
6293 const size_t numEnt,
6294 const GlobalOrdinal gidsIn[],
6295 const impl_scalar_type valsIn[],
6296 const size_t numBytesPerValue) const
6297 {
6298 using Kokkos::View;
6299 using Kokkos::subview;
6301 typedef LocalOrdinal LO;
6302 typedef GlobalOrdinal GO;
6303 typedef impl_scalar_type ST;
6304
6305 if (numEnt == 0) {
6306 // Empty rows always take zero bytes, to ensure sparsity.
6307 return 0;
6308 }
6309
6310 const GO gid = 0; // packValueCount wants this
6311 const LO numEntLO = static_cast<size_t> (numEnt);
6312
6313 const size_t numEntBeg = offset;
6314 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6315 const size_t gidsBeg = numEntBeg + numEntLen;
6316 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6317 const size_t valsBeg = gidsBeg + gidsLen;
6318 const size_t valsLen = numEnt * numBytesPerValue;
6319
6320 char* const numEntOut = exports + numEntBeg;
6321 char* const gidsOut = exports + gidsBeg;
6322 char* const valsOut = exports + valsBeg;
6323
6324 size_t numBytesOut = 0;
6325 int errorCode = 0;
6326 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6327
6328 {
6329 Kokkos::pair<int, size_t> p;
6330 p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6331 errorCode += p.first;
6332 numBytesOut += p.second;
6333
6334 p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6335 errorCode += p.first;
6336 numBytesOut += p.second;
6337 }
6338
6339 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6340 TEUCHOS_TEST_FOR_EXCEPTION
6341 (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6342 "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6343 << expectedNumBytes << ".");
6344 TEUCHOS_TEST_FOR_EXCEPTION
6345 (errorCode != 0, std::runtime_error, "packRow: "
6346 "PackTraits::packArray returned a nonzero error code");
6347
6348 return numBytesOut;
6349 }
6350
6351 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6352 size_t
6353 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6354 unpackRow (GlobalOrdinal gidsOut[],
6355 impl_scalar_type valsOut[],
6356 const char imports[],
6357 const size_t offset,
6358 const size_t numBytes,
6359 const size_t numEnt,
6360 const size_t numBytesPerValue)
6361 {
6362 using Kokkos::View;
6363 using Kokkos::subview;
6365 typedef LocalOrdinal LO;
6366 typedef GlobalOrdinal GO;
6367 typedef impl_scalar_type ST;
6368
6369 Details::ProfilingRegion region_upack_row(
6370 "Tpetra::CrsMatrix::unpackRow",
6371 "Import/Export"
6372 );
6373
6374 if (numBytes == 0) {
6375 // Rows with zero bytes should always have zero entries.
6376 if (numEnt != 0) {
6377 const int myRank = this->getMap ()->getComm ()->getRank ();
6378 TEUCHOS_TEST_FOR_EXCEPTION
6379 (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6380 "unpackRow: The number of bytes to unpack numBytes=0, but the "
6381 "number of entries to unpack (as reported by numPacketsPerLID) "
6382 "for this row numEnt=" << numEnt << " != 0.");
6383 }
6384 return 0;
6385 }
6386
6387 if (numEnt == 0 && numBytes != 0) {
6388 const int myRank = this->getMap ()->getComm ()->getRank ();
6389 TEUCHOS_TEST_FOR_EXCEPTION
6390 (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6391 "unpackRow: The number of entries to unpack (as reported by "
6392 "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6393 "numBytes=" << numBytes << " != 0.");
6394 }
6395
6396 const GO gid = 0; // packValueCount wants this
6397 const LO lid = 0; // packValueCount wants this
6398
6399 const size_t numEntBeg = offset;
6400 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6401 const size_t gidsBeg = numEntBeg + numEntLen;
6402 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6403 const size_t valsBeg = gidsBeg + gidsLen;
6404 const size_t valsLen = numEnt * numBytesPerValue;
6405
6406 const char* const numEntIn = imports + numEntBeg;
6407 const char* const gidsIn = imports + gidsBeg;
6408 const char* const valsIn = imports + valsBeg;
6409
6410 size_t numBytesOut = 0;
6411 int errorCode = 0;
6412 LO numEntOut;
6413 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6414 if (static_cast<size_t> (numEntOut) != numEnt ||
6415 numEntOut == static_cast<LO> (0)) {
6416 const int myRank = this->getMap ()->getComm ()->getRank ();
6417 std::ostringstream os;
6418 os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6419 bool firstErrorCondition = false;
6420 if (static_cast<size_t> (numEntOut) != numEnt) {
6421 os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6422 << " does not equal number of entries unpacked from imports "
6423 "buffer numEntOut=" << numEntOut << ".";
6424 firstErrorCondition = true;
6425 }
6426 if (numEntOut == static_cast<LO> (0)) {
6427 if (firstErrorCondition) {
6428 os << " Also, ";
6429 }
6430 os << "Number of entries unpacked from imports buffer numEntOut=0, "
6431 "but number of bytes to unpack for this row numBytes=" << numBytes
6432 << " != 0. This should never happen, since packRow should only "
6433 "ever pack rows with a nonzero number of entries. In this case, "
6434 "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6435 << ".";
6436 }
6437 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6438 }
6439
6440 {
6441 Kokkos::pair<int, size_t> p;
6442 p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6443 errorCode += p.first;
6444 numBytesOut += p.second;
6445
6446 p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6447 errorCode += p.first;
6448 numBytesOut += p.second;
6449 }
6450
6451 TEUCHOS_TEST_FOR_EXCEPTION
6452 (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6453 << numBytesOut << " != numBytes = " << numBytes << ".");
6454
6455 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6456 TEUCHOS_TEST_FOR_EXCEPTION
6457 (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6458 "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6459 << expectedNumBytes << ".");
6460
6461 TEUCHOS_TEST_FOR_EXCEPTION
6462 (errorCode != 0, std::runtime_error, "unpackRow: "
6463 "PackTraits::unpackArray returned a nonzero error code");
6464
6465 return numBytesOut;
6466 }
6467
6468 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6469 void
6470 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6471 allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6472 size_t& totalNumEntries,
6473 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6474 {
6475 using Details::Behavior;
6476 using Details::dualViewStatusToString;
6477 using std::endl;
6478 typedef impl_scalar_type IST;
6479 typedef LocalOrdinal LO;
6480 typedef GlobalOrdinal GO;
6481 //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6482
6483 // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6484 // output to std::cerr on every MPI process. This is unwise for
6485 // runs with large numbers of MPI processes.
6486 const bool verbose = Behavior::verbose("CrsMatrix");
6487 std::unique_ptr<std::string> prefix;
6488 if (verbose) {
6489 prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6490 std::ostringstream os;
6491 os << *prefix << "Before:"
6492 << endl
6493 << *prefix << " "
6494 << dualViewStatusToString (exports, "exports")
6495 << endl
6496 << *prefix << " "
6497 << dualViewStatusToString (exportLIDs, "exportLIDs")
6498 << endl;
6499 std::cerr << os.str ();
6500 }
6501
6502 // The number of export LIDs must fit in LocalOrdinal, assuming
6503 // that the LIDs are distinct and valid on the calling process.
6504 const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6505
6506 TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6507 auto exportLIDs_h = exportLIDs.view_host ();
6508
6509 // Count the total number of matrix entries to send.
6510 totalNumEntries = 0;
6511 for (LO i = 0; i < numExportLIDs; ++i) {
6512 const LO lclRow = exportLIDs_h[i];
6513 size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6514 // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6515 // indices as an error. Just consider them nonowned for now.
6516 if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6517 curNumEntries = 0;
6518 }
6519 totalNumEntries += curNumEntries;
6520 }
6521
6522 // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6523 // if sizeof(IST) is a meaningful representation of the amount of
6524 // data in a Scalar instance. (LO and GO are always built-in
6525 // integer types.)
6526 //
6527 // Allocate the exports array. It does NOT need padding for
6528 // alignment, since we use memcpy to write to / read from send /
6529 // receive buffers.
6530 const size_t allocSize =
6531 static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6532 totalNumEntries * (sizeof (IST) + sizeof (GO));
6533 if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6534 using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6535
6536 const std::string oldLabel = exports.d_view.label ();
6537 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6538 exports = exports_type (newLabel, allocSize);
6539 }
6540
6541 if (verbose) {
6542 std::ostringstream os;
6543 os << *prefix << "After:"
6544 << endl
6545 << *prefix << " "
6546 << dualViewStatusToString (exports, "exports")
6547 << endl
6548 << *prefix << " "
6549 << dualViewStatusToString (exportLIDs, "exportLIDs")
6550 << endl;
6551 std::cerr << os.str ();
6552 }
6553 }
6554
6555 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6556 void
6558 packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6559 Kokkos::DualView<char*, buffer_device_type>& exports,
6560 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6561 size_t& constantNumPackets) const
6562 {
6563 // The call to packNew in packAndPrepare catches and handles any exceptions.
6564 Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6565 if (this->isStaticGraph ()) {
6566 using ::Tpetra::Details::packCrsMatrixNew;
6567 packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6568 constantNumPackets);
6569 }
6570 else {
6571 this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6572 constantNumPackets);
6573 }
6574 }
6575
6576 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6577 void
6579 packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6580 Kokkos::DualView<char*, buffer_device_type>& exports,
6581 const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6582 size_t& constantNumPackets) const
6583 {
6584 using Details::Behavior;
6585 using Details::dualViewStatusToString;
6586 using Details::PackTraits;
6587 using Details::create_mirror_view_from_raw_host_array;
6588 using Kokkos::View;
6589 using std::endl;
6590 using LO = LocalOrdinal;
6591 using GO = GlobalOrdinal;
6592 using ST = impl_scalar_type;
6593 const char tfecfFuncName[] = "packNonStaticNew: ";
6594
6595 const bool verbose = Behavior::verbose("CrsMatrix");
6596 std::unique_ptr<std::string> prefix;
6597 if (verbose) {
6598 prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6599 std::ostringstream os;
6600 os << *prefix << "Start" << endl;
6601 std::cerr << os.str ();
6602 }
6603
6604 const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6605 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6606 (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6607 std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6608 << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6609 << ".");
6610
6611 // Setting this to zero tells the caller to expect a possibly
6612 // different ("nonconstant") number of packets per local index
6613 // (i.e., a possibly different number of entries per row).
6614 constantNumPackets = 0;
6615
6616 // The pack buffer 'exports' enters this method possibly
6617 // unallocated. Do the first two parts of "Count, allocate, fill,
6618 // compute."
6619 size_t totalNumEntries = 0;
6620 this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6621 const size_t bufSize = static_cast<size_t> (exports.extent (0));
6622
6623 // Write-only host access
6624 exports.clear_sync_state();
6625 exports.modify_host();
6626 auto exports_h = exports.view_host ();
6627 if (verbose) {
6628 std::ostringstream os;
6629 os << *prefix << "After marking exports as modified on host, "
6630 << dualViewStatusToString (exports, "exports") << endl;
6631 std::cerr << os.str ();
6632 }
6633
6634 // Read-only host access
6635 auto exportLIDs_h = exportLIDs.view_host ();
6636
6637 // Write-only host access
6638 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6639 const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6640 auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6641
6642 // Compute the number of "packets" (in this case, bytes) per
6643 // export LID (in this case, local index of the row to send), and
6644 // actually pack the data.
6645 auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6646
6647
6648 // Temporary buffer for global column indices.
6649 typename global_inds_host_view_type::non_const_type gidsIn_k;
6650 if (this->isLocallyIndexed()) { // Need storage for Global IDs
6651 gidsIn_k =
6652 typename global_inds_host_view_type::non_const_type("packGids",
6653 maxRowNumEnt);
6654 }
6655
6656 size_t offset = 0; // current index into 'exports' array.
6657 for (size_t i = 0; i < numExportLIDs; ++i) {
6658 const LO lclRow = exportLIDs_h[i];
6659
6660 size_t numBytes = 0;
6661 size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6662
6663 // Only pack this row's data if it has a nonzero number of
6664 // entries. We can do this because receiving processes get the
6665 // number of packets, and will know that zero packets means zero
6666 // entries.
6667 if (numEnt == 0) {
6668 numPacketsPerLID_h[i] = 0;
6669 continue;
6670 }
6671
6672 if (this->isLocallyIndexed ()) {
6673 typename global_inds_host_view_type::non_const_type gidsIn;
6674 values_host_view_type valsIn;
6675 // If the matrix is locally indexed on the calling process, we
6676 // have to use its column Map (which it _must_ have in this
6677 // case) to convert to global indices.
6678 local_inds_host_view_type lidsIn;
6679 this->getLocalRowView (lclRow, lidsIn, valsIn);
6680 const map_type& colMap = * (this->getColMap ());
6681 for (size_t k = 0; k < numEnt; ++k) {
6682 gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6683 }
6684 gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6685
6686 const size_t numBytesPerValue =
6687 PackTraits<ST>::packValueCount (valsIn[0]);
6688 numBytes = this->packRow (exports_h.data (), offset, numEnt,
6689 gidsIn.data (), valsIn.data (),
6690 numBytesPerValue);
6691 }
6692 else if (this->isGloballyIndexed ()) {
6693 global_inds_host_view_type gidsIn;
6694 values_host_view_type valsIn;
6695 // If the matrix is globally indexed on the calling process,
6696 // then we can use the column indices directly. However, we
6697 // have to get the global row index. The calling process must
6698 // have a row Map, since otherwise it shouldn't be participating
6699 // in packing operations.
6700 const map_type& rowMap = * (this->getRowMap ());
6701 const GO gblRow = rowMap.getGlobalElement (lclRow);
6702 this->getGlobalRowView (gblRow, gidsIn, valsIn);
6703
6704 const size_t numBytesPerValue =
6705 PackTraits<ST>::packValueCount (valsIn[0]);
6706 numBytes = this->packRow (exports_h.data (), offset, numEnt,
6707 gidsIn.data (), valsIn.data (),
6708 numBytesPerValue);
6709 }
6710 // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6711 // nor locally indexed, then it has no entries. Therefore,
6712 // there is nothing to pack. No worries!
6713
6714 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6715 (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6716 "First invalid offset into 'exports' pack buffer at index i = " << i
6717 << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6718 bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6719 ".");
6720 // numPacketsPerLID_h[i] is the number of "packets" in the
6721 // current local row i. Packet=char (really "byte") so use the
6722 // number of bytes of the packed data for that row.
6723 numPacketsPerLID_h[i] = numBytes;
6724 offset += numBytes;
6725 }
6726
6727 if (verbose) {
6728 std::ostringstream os;
6729 os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6730 << *prefix << " "
6731 << dualViewStatusToString (exports, "exports")
6732 << endl
6733 << *prefix << " "
6734 << dualViewStatusToString (exportLIDs, "exportLIDs")
6735 << endl;
6736 std::cerr << os.str ();
6737 }
6738 }
6739
6740 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6741 LocalOrdinal
6742 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6743 combineGlobalValuesRaw(const LocalOrdinal lclRow,
6744 const LocalOrdinal numEnt,
6745 const impl_scalar_type vals[],
6746 const GlobalOrdinal cols[],
6747 const Tpetra::CombineMode combMode,
6748 const char* const prefix,
6749 const bool debug,
6750 const bool verbose)
6751 {
6752 using GO = GlobalOrdinal;
6753
6754 // mfh 23 Mar 2017: This branch is not thread safe in a debug
6755 // build, due to use of Teuchos::ArrayView; see #229.
6756 const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6757 Teuchos::ArrayView<const GO> cols_av
6758 (numEnt == 0 ? nullptr : cols, numEnt);
6759 Teuchos::ArrayView<const Scalar> vals_av
6760 (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6761
6762 // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6763 // combine modes. combineGlobalValues throws on error; it does
6764 // not return an error code. Thus, if it returns, it succeeded.
6765 combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6766 prefix, debug, verbose);
6767 return numEnt;
6768 }
6769
6770 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6771 void
6772 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6773 combineGlobalValues(
6774 const GlobalOrdinal globalRowIndex,
6775 const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6776 const Teuchos::ArrayView<const Scalar>& values,
6777 const Tpetra::CombineMode combineMode,
6778 const char* const prefix,
6779 const bool debug,
6780 const bool verbose)
6781 {
6782 const char tfecfFuncName[] = "combineGlobalValues: ";
6783
6784 if (isStaticGraph ()) {
6785 // INSERT doesn't make sense for a static graph, since you
6786 // aren't allowed to change the structure of the graph.
6787 // However, all the other combine modes work.
6788 if (combineMode == ADD) {
6789 sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6790 }
6791 else if (combineMode == REPLACE) {
6792 replaceGlobalValues (globalRowIndex, columnIndices, values);
6793 }
6794 else if (combineMode == ABSMAX) {
6795 using ::Tpetra::Details::AbsMax;
6796 AbsMax<Scalar> f;
6797 this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6798 columnIndices,
6799 values, f);
6800 }
6801 else if (combineMode == INSERT) {
6802 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6803 (isStaticGraph() && combineMode == INSERT,
6804 std::invalid_argument, "INSERT combine mode is forbidden "
6805 "if the matrix has a static (const) graph (i.e., was "
6806 "constructed with the CrsMatrix constructor that takes a "
6807 "const CrsGraph pointer).");
6808 }
6809 else {
6810 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6811 (true, std::logic_error, "Invalid combine mode; should "
6812 "never get here! "
6813 "Please report this bug to the Tpetra developers.");
6814 }
6815 }
6816 else { // The matrix has a dynamic graph.
6817 if (combineMode == ADD || combineMode == INSERT) {
6818 // For a dynamic graph, all incoming column indices are
6819 // inserted into the target graph. Duplicate indices will
6820 // have their values summed. In this context, ADD and INSERT
6821 // are equivalent. We need to call insertGlobalValues()
6822 // anyway if the column indices don't yet exist in this row,
6823 // so we just call insertGlobalValues() for both cases.
6824 insertGlobalValuesFilteredChecked(globalRowIndex,
6825 columnIndices, values, prefix, debug, verbose);
6826 }
6827 // FIXME (mfh 14 Mar 2012):
6828 //
6829 // Implementing ABSMAX or REPLACE for a dynamic graph would
6830 // require modifying assembly to attach a possibly different
6831 // combine mode to each inserted (i, j, A_ij) entry. For
6832 // example, consider two different Export operations to the same
6833 // target CrsMatrix, the first with ABSMAX combine mode and the
6834 // second with REPLACE. This isn't a common use case, so we
6835 // won't mess with it for now.
6836 else if (combineMode == ABSMAX) {
6837 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6838 ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6839 "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6840 "implemented.");
6841 }
6842 else if (combineMode == REPLACE) {
6843 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6844 ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6845 "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6846 "implemented.");
6847 }
6848 else {
6849 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6850 true, std::logic_error, "Should never get here! Please report this "
6851 "bug to the Tpetra developers.");
6852 }
6853 }
6854 }
6855
6856 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6857 void
6860 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6861 Kokkos::DualView<char*, buffer_device_type> imports,
6862 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6863 const size_t constantNumPackets,
6864 const CombineMode combineMode)
6865 {
6866 using Details::Behavior;
6867 using Details::dualViewStatusToString;
6869 using std::endl;
6870 const char tfecfFuncName[] = "unpackAndCombine: ";
6871 ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6872
6873 const bool debug = Behavior::debug("CrsMatrix");
6874 const bool verbose = Behavior::verbose("CrsMatrix");
6875 constexpr int numValidModes = 5;
6876 const CombineMode validModes[numValidModes] =
6878 const char* validModeNames[numValidModes] =
6879 {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6880
6881 std::unique_ptr<std::string> prefix;
6882 if (verbose) {
6883 prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6884 std::ostringstream os;
6885 os << *prefix << "Start:" << endl
6886 << *prefix << " "
6887 << dualViewStatusToString (importLIDs, "importLIDs")
6888 << endl
6889 << *prefix << " "
6890 << dualViewStatusToString (imports, "imports")
6891 << endl
6892 << *prefix << " "
6893 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6894 << endl
6895 << *prefix << " constantNumPackets: " << constantNumPackets
6896 << endl
6897 << *prefix << " combineMode: " << combineModeToString (combineMode)
6898 << endl;
6899 std::cerr << os.str ();
6900 }
6901
6902 if (debug) {
6903 if (std::find (validModes, validModes+numValidModes, combineMode) ==
6904 validModes+numValidModes) {
6905 std::ostringstream os;
6906 os << "Invalid combine mode. Valid modes are {";
6907 for (int k = 0; k < numValidModes; ++k) {
6908 os << validModeNames[k];
6909 if (k < numValidModes - 1) {
6910 os << ", ";
6911 }
6912 }
6913 os << "}.";
6914 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6915 (true, std::invalid_argument, os.str ());
6916 }
6917 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6918 (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6919 std::invalid_argument, "importLIDs.extent(0)="
6920 << importLIDs.extent(0)
6921 << " != numPacketsPerLID.extent(0)="
6922 << numPacketsPerLID.extent(0) << ".");
6923 }
6924
6925 if (combineMode == ZERO) {
6926 return; // nothing to do
6927 }
6928
6929 if (debug) {
6930 using Teuchos::reduceAll;
6931 std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
6932 int lclBad = 0;
6933 try {
6934 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6935 constantNumPackets, combineMode,
6936 verbose);
6937 } catch (std::exception& e) {
6938 lclBad = 1;
6939 *msg << e.what ();
6940 }
6941 int gblBad = 0;
6942 const Teuchos::Comm<int>& comm = * (this->getComm ());
6943 reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
6944 lclBad, Teuchos::outArg (gblBad));
6945 if (gblBad != 0) {
6946 // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6947 // initialized in a debug build. Thus, we get the process
6948 // rank again here. This is an error message, so the small
6949 // run-time cost doesn't matter. See #1887.
6950 std::ostringstream os;
6951 os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
6952 msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
6953 ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
6954 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6955 (true, std::logic_error, std::endl << "unpackAndCombineImpl "
6956 "threw an exception on one or more participating processes: "
6957 << endl << msg->str ());
6958 }
6959 }
6960 else {
6961 unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6962 constantNumPackets, combineMode,
6963 verbose);
6964 }
6965
6966 if (verbose) {
6967 std::ostringstream os;
6968 os << *prefix << "Done!" << endl
6969 << *prefix << " "
6970 << dualViewStatusToString (importLIDs, "importLIDs")
6971 << endl
6972 << *prefix << " "
6973 << dualViewStatusToString (imports, "imports")
6974 << endl
6975 << *prefix << " "
6976 << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6977 << endl;
6978 std::cerr << os.str ();
6979 }
6980 }
6981
6982 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6983 void
6986 const Kokkos::DualView<const local_ordinal_type*,
6987 buffer_device_type>& importLIDs,
6988 Kokkos::DualView<char*, buffer_device_type> imports,
6989 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6990 const size_t constantNumPackets,
6991 const CombineMode combineMode,
6992 const bool verbose)
6993 {
6994 Details::ProfilingRegion region_unpack_and_combine_impl(
6995 "Tpetra::CrsMatrix::unpackAndCombineImpl",
6996 "Import/Export"
6997 );
6998 using std::endl;
6999 const char tfecfFuncName[] = "unpackAndCombineImpl";
7000 std::unique_ptr<std::string> prefix;
7001 if (verbose) {
7002 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7003 std::ostringstream os;
7004 os << *prefix << "isStaticGraph(): "
7005 << (isStaticGraph() ? "true" : "false")
7006 << ", importLIDs.extent(0): "
7007 << importLIDs.extent(0)
7008 << ", imports.extent(0): "
7009 << imports.extent(0)
7010 << ", numPacketsPerLID.extent(0): "
7011 << numPacketsPerLID.extent(0)
7012 << endl;
7013 std::cerr << os.str();
7014 }
7015
7016 if (isStaticGraph ()) {
7017 using Details::unpackCrsMatrixAndCombineNew;
7018 unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7019 importLIDs, constantNumPackets,
7020 combineMode);
7021 }
7022 else {
7023 {
7024 using padding_type = typename crs_graph_type::padding_type;
7025 std::unique_ptr<padding_type> padding;
7026 try {
7027 padding = myGraph_->computePaddingForCrsMatrixUnpack(
7028 importLIDs, imports, numPacketsPerLID, verbose);
7029 }
7030 catch (std::exception& e) {
7031 const auto rowMap = getRowMap();
7032 const auto comm = rowMap.is_null() ? Teuchos::null :
7033 rowMap->getComm();
7034 const int myRank = comm.is_null() ? -1 : comm->getRank();
7035 TEUCHOS_TEST_FOR_EXCEPTION
7036 (true, std::runtime_error, "Proc " << myRank << ": "
7037 "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7038 "threw an exception: " << e.what());
7039 }
7040 if (verbose) {
7041 std::ostringstream os;
7042 os << *prefix << "Call applyCrsPadding" << endl;
7043 std::cerr << os.str();
7044 }
7045 applyCrsPadding(*padding, verbose);
7046 }
7047 if (verbose) {
7048 std::ostringstream os;
7049 os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7050 std::cerr << os.str();
7051 }
7052 unpackAndCombineImplNonStatic(importLIDs, imports,
7053 numPacketsPerLID,
7054 constantNumPackets,
7055 combineMode);
7056 }
7057
7058 if (verbose) {
7059 std::ostringstream os;
7060 os << *prefix << "Done" << endl;
7061 std::cerr << os.str();
7062 }
7063 }
7064
7065 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7066 void
7067 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7068 unpackAndCombineImplNonStatic(
7069 const Kokkos::DualView<const local_ordinal_type*,
7070 buffer_device_type>& importLIDs,
7071 Kokkos::DualView<char*, buffer_device_type> imports,
7072 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7073 const size_t constantNumPackets,
7074 const CombineMode combineMode)
7075 {
7076 using Kokkos::View;
7077 using Kokkos::subview;
7078 using Kokkos::MemoryUnmanaged;
7079 using Details::Behavior;
7080 using Details::castAwayConstDualView;
7081 using Details::create_mirror_view_from_raw_host_array;
7082 using Details::PackTraits;
7083 using Details::ScalarViewTraits;
7084 using std::endl;
7085 using LO = LocalOrdinal;
7086 using GO = GlobalOrdinal;
7087 using ST = impl_scalar_type;
7088 using size_type = typename Teuchos::ArrayView<LO>::size_type;
7089 using HES =
7090 typename View<int*, device_type>::HostMirror::execution_space;
7091 using pair_type = std::pair<typename View<int*, HES>::size_type,
7092 typename View<int*, HES>::size_type>;
7093 using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7094 using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7095 const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7096
7097 const bool debug = Behavior::debug("CrsMatrix");
7098 const bool verbose = Behavior::verbose("CrsMatrix");
7099 std::unique_ptr<std::string> prefix;
7100 if (verbose) {
7101 prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7102 std::ostringstream os;
7103 os << *prefix << endl; // we've already printed DualViews' statuses
7104 std::cerr << os.str ();
7105 }
7106 const char* const prefix_raw =
7107 verbose ? prefix.get()->c_str() : nullptr;
7108
7109 const size_type numImportLIDs = importLIDs.extent (0);
7110 if (combineMode == ZERO || numImportLIDs == 0) {
7111 return; // nothing to do; no need to combine entries
7112 }
7113
7114 Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7115 "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7116 "Import/Export"
7117 );
7118
7119 // We're unpacking on host. This is read-only host access.
7120 if (imports.need_sync_host()) {
7121 imports.sync_host ();
7122 }
7123 auto imports_h = imports.view_host();
7124
7125 // Read-only host access.
7126 if (numPacketsPerLID.need_sync_host()) {
7127 numPacketsPerLID.sync_host ();
7128 }
7129 auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7130
7131 TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7132 auto importLIDs_h = importLIDs.view_host();
7133
7134 size_t numBytesPerValue;
7135 {
7136 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7137 // with run-time size? We already assume that all entries in both the
7138 // source and target matrices have the same size. If the calling process
7139 // owns at least one entry in either matrix, we can use that entry to set
7140 // the size. However, it is possible that the calling process owns no
7141 // entries. In that case, we're in trouble. One way to fix this would be
7142 // for each row's data to contain the run-time size. This is only
7143 // necessary if the size is not a compile-time constant.
7144 Scalar val;
7145 numBytesPerValue = PackTraits<ST>::packValueCount (val);
7146 }
7147
7148 // Determine the maximum number of entries in any one row
7149 size_t offset = 0;
7150 size_t maxRowNumEnt = 0;
7151 for (size_type i = 0; i < numImportLIDs; ++i) {
7152 const size_t numBytes = numPacketsPerLID_h[i];
7153 if (numBytes == 0) {
7154 continue; // empty buffer for that row means that the row is empty
7155 }
7156 // We need to unpack a nonzero number of entries for this row.
7157 if (debug) {
7158 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7159 (offset + numBytes > size_t(imports_h.extent (0)),
7160 std::logic_error, ": At local row index importLIDs_h[i="
7161 << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7162 << ") + numBytes (=" << numBytes << ") > "
7163 "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7164 }
7165 LO numEntLO = 0;
7166
7167 if (debug) {
7168 const size_t theNumBytes =
7169 PackTraits<LO>::packValueCount (numEntLO);
7170 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7171 (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7172 << theNumBytes << " > numBytes = " << numBytes << ".");
7173 }
7174 const char* const inBuf = imports_h.data () + offset;
7175 const size_t actualNumBytes =
7176 PackTraits<LO>::unpackValue (numEntLO, inBuf);
7177
7178 if (debug) {
7179 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7180 (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7181 << ", actualNumBytes=" << actualNumBytes
7182 << " > numBytes=" << numBytes << ".");
7183 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7184 (numEntLO == 0, std::logic_error, ": At local row index "
7185 "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7186 "the number of entries read from the packed data is "
7187 "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7188 << " != 0.");
7189 }
7190
7191 maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7192 offset += numBytes;
7193 }
7194
7195 // Temporary space to cache incoming global column indices and
7196 // values. Column indices come in as global indices, in case the
7197 // source object's column Map differs from the target object's
7198 // (this's) column Map.
7199 View<GO*, HES> gblColInds;
7200 View<LO*, HES> lclColInds;
7201 View<ST*, HES> vals;
7202 {
7203 GO gid = 0;
7204 LO lid = 0;
7205 // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7206 // with run-time size? We already assume that all entries in both the
7207 // source and target matrices have the same size. If the calling process
7208 // owns at least one entry in either matrix, we can use that entry to set
7209 // the size. However, it is possible that the calling process owns no
7210 // entries. In that case, we're in trouble. One way to fix this would be
7211 // for each row's data to contain the run-time size. This is only
7212 // necessary if the size is not a compile-time constant.
7213 Scalar val;
7214 gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7215 gid, maxRowNumEnt, "gids");
7216 lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7217 lid, maxRowNumEnt, "lids");
7218 vals = ScalarViewTraits<ST, HES>::allocateArray(
7219 val, maxRowNumEnt, "vals");
7220 }
7221
7222 offset = 0;
7223 for (size_type i = 0; i < numImportLIDs; ++i) {
7224 const size_t numBytes = numPacketsPerLID_h[i];
7225 if (numBytes == 0) {
7226 continue; // empty buffer for that row means that the row is empty
7227 }
7228 LO numEntLO = 0;
7229 const char* const inBuf = imports_h.data () + offset;
7230 (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7231
7232 const size_t numEnt = static_cast<size_t>(numEntLO);;
7233 const LO lclRow = importLIDs_h[i];
7234
7235 gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7236 vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7237
7238 const size_t numBytesOut =
7239 unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7240 offset, numBytes, numEnt, numBytesPerValue);
7241 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7242 (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7243 << ", numBytes=" << numBytes << " != numBytesOut="
7244 << numBytesOut << ".");
7245
7246 const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7247 const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7248 combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7249 combineMode, prefix_raw, debug, verbose);
7250 // Don't update offset until current LID has succeeded.
7251 offset += numBytes;
7252 } // for each import LID i
7253
7254 if (verbose) {
7255 std::ostringstream os;
7256 os << *prefix << "Done" << endl;
7257 std::cerr << os.str();
7258 }
7259 }
7260
7261 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7262 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7264 getColumnMapMultiVector (const MV& X_domainMap,
7265 const bool force) const
7266 {
7267 using Teuchos::null;
7268 using Teuchos::RCP;
7269 using Teuchos::rcp;
7270
7271 TEUCHOS_TEST_FOR_EXCEPTION(
7272 ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7273 "MapMultiVector: You may only call this method if the matrix has a "
7274 "column Map. If the matrix does not yet have a column Map, you should "
7275 "first call fillComplete (with domain and range Map if necessary).");
7276
7277 // If the graph is not fill complete, then the Import object (if
7278 // one should exist) hasn't been constructed yet.
7279 TEUCHOS_TEST_FOR_EXCEPTION(
7280 ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7281 "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7282 "this matrix's graph is fill complete.");
7283
7284 const size_t numVecs = X_domainMap.getNumVectors ();
7285 RCP<const import_type> importer = this->getGraph ()->getImporter ();
7286 RCP<const map_type> colMap = this->getColMap ();
7287
7288 RCP<MV> X_colMap; // null by default
7289
7290 // If the Import object is trivial (null), then we don't need a
7291 // separate column Map multivector. Just return null in that
7292 // case. The caller is responsible for knowing not to use the
7293 // returned null pointer.
7294 //
7295 // If the Import is nontrivial, then we do need a separate
7296 // column Map multivector for the Import operation. Check in
7297 // that case if we have to (re)create the column Map
7298 // multivector.
7299 if (! importer.is_null () || force) {
7300 if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7301 X_colMap = rcp (new MV (colMap, numVecs));
7302
7303 // Cache the newly created multivector for later reuse.
7304 importMV_ = X_colMap;
7305 }
7306 else { // Yay, we can reuse the cached multivector!
7307 X_colMap = importMV_;
7308 // mfh 09 Jan 2013: We don't have to fill with zeros first,
7309 // because the Import uses INSERT combine mode, which overwrites
7310 // existing entries.
7311 //
7312 //X_colMap->putScalar (ZERO);
7313 }
7314 }
7315 return X_colMap;
7316 }
7317
7318 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7319 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7322 const bool force) const
7323 {
7324 using Teuchos::null;
7325 using Teuchos::RCP;
7326 using Teuchos::rcp;
7327
7328 // If the graph is not fill complete, then the Export object (if
7329 // one should exist) hasn't been constructed yet.
7330 TEUCHOS_TEST_FOR_EXCEPTION(
7331 ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7332 "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7333 "matrix's graph is fill complete.");
7334
7335 const size_t numVecs = Y_rangeMap.getNumVectors ();
7336 RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7337 // Every version of the constructor takes either a row Map, or a
7338 // graph (all of whose constructors take a row Map). Thus, the
7339 // matrix always has a row Map.
7340 RCP<const map_type> rowMap = this->getRowMap ();
7341
7342 RCP<MV> Y_rowMap; // null by default
7343
7344 // If the Export object is trivial (null), then we don't need a
7345 // separate row Map multivector. Just return null in that case.
7346 // The caller is responsible for knowing not to use the returned
7347 // null pointer.
7348 //
7349 // If the Export is nontrivial, then we do need a separate row
7350 // Map multivector for the Export operation. Check in that case
7351 // if we have to (re)create the row Map multivector.
7352 if (! exporter.is_null () || force) {
7353 if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7354 Y_rowMap = rcp (new MV (rowMap, numVecs));
7355 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7356 }
7357 else { // Yay, we can reuse the cached multivector!
7358 Y_rowMap = exportMV_;
7359 }
7360 }
7361 return Y_rowMap;
7362 }
7363
7364 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7365 void
7367 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7368 {
7369 TEUCHOS_TEST_FOR_EXCEPTION(
7370 myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7371 "removeEmptyProcessesInPlace: This method does not work when the matrix "
7372 "was created with a constant graph (that is, when it was created using "
7373 "the version of its constructor that takes an RCP<const CrsGraph>). "
7374 "This is because the matrix is not allowed to modify the graph in that "
7375 "case, but removing empty processes requires modifying the graph.");
7376 myGraph_->removeEmptyProcessesInPlace (newMap);
7377 // Even though CrsMatrix's row Map (as returned by getRowMap())
7378 // comes from its CrsGraph, CrsMatrix still implements DistObject,
7379 // so we also have to change the DistObject's Map.
7380 this->map_ = this->getRowMap ();
7381 // In the nonconst graph case, staticGraph_ is just a const
7382 // pointer to myGraph_. This assignment is probably redundant,
7383 // but it doesn't hurt.
7384 staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7385 }
7386
7387 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7388 Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7390 add (const Scalar& alpha,
7392 const Scalar& beta,
7393 const Teuchos::RCP<const map_type>& domainMap,
7394 const Teuchos::RCP<const map_type>& rangeMap,
7395 const Teuchos::RCP<Teuchos::ParameterList>& params) const
7396 {
7397 using Teuchos::Array;
7398 using Teuchos::ArrayView;
7399 using Teuchos::ParameterList;
7400 using Teuchos::RCP;
7401 using Teuchos::rcp;
7402 using Teuchos::rcp_implicit_cast;
7403 using Teuchos::sublist;
7404 using std::endl;
7405 using LO = local_ordinal_type;
7406 using GO = global_ordinal_type;
7407 using crs_matrix_type =
7409 const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7410
7411 const bool debug = Details::Behavior::debug("CrsMatrix");
7412 const bool verbose = Details::Behavior::verbose("CrsMatrix");
7413 std::unique_ptr<std::string> prefix;
7414 if (verbose) {
7415 prefix = this->createPrefix("CrsMatrix", "add");
7416 std::ostringstream os;
7417 os << *prefix << "Start" << endl;
7418 std::cerr << os.str ();
7419 }
7420
7421 const crs_matrix_type& B = *this; // a convenient abbreviation
7422 const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7423 const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7424
7425 // If the user didn't supply a domain or range Map, then try to
7426 // get one from B first (if it has them), then from A (if it has
7427 // them). If we don't have any domain or range Maps, scold the
7428 // user.
7429 RCP<const map_type> A_domainMap = A.getDomainMap ();
7430 RCP<const map_type> A_rangeMap = A.getRangeMap ();
7431 RCP<const map_type> B_domainMap = B.getDomainMap ();
7432 RCP<const map_type> B_rangeMap = B.getRangeMap ();
7433
7434 RCP<const map_type> theDomainMap = domainMap;
7435 RCP<const map_type> theRangeMap = rangeMap;
7436
7437 if (domainMap.is_null ()) {
7438 if (B_domainMap.is_null ()) {
7439 TEUCHOS_TEST_FOR_EXCEPTION(
7440 A_domainMap.is_null (), std::invalid_argument,
7441 "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7442 "then you must supply a nonnull domain Map to this method.");
7443 theDomainMap = A_domainMap;
7444 } else {
7445 theDomainMap = B_domainMap;
7446 }
7447 }
7448 if (rangeMap.is_null ()) {
7449 if (B_rangeMap.is_null ()) {
7450 TEUCHOS_TEST_FOR_EXCEPTION(
7451 A_rangeMap.is_null (), std::invalid_argument,
7452 "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7453 "then you must supply a nonnull range Map to this method.");
7454 theRangeMap = A_rangeMap;
7455 } else {
7456 theRangeMap = B_rangeMap;
7457 }
7458 }
7459
7460 if (debug) {
7461 // In debug mode, check that A and B have matching domain and
7462 // range Maps, if they have domain and range Maps at all. (If
7463 // they aren't fill complete, then they may not yet have them.)
7464 if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7465 if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7466 TEUCHOS_TEST_FOR_EXCEPTION
7467 (! B_domainMap->isSameAs(*A_domainMap),
7468 std::invalid_argument,
7469 errPfx << "The input RowMatrix A must have a domain Map "
7470 "which is the same as (isSameAs) this RowMatrix's "
7471 "domain Map.");
7472 TEUCHOS_TEST_FOR_EXCEPTION
7473 (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7474 errPfx << "The input RowMatrix A must have a range Map "
7475 "which is the same as (isSameAs) this RowMatrix's range "
7476 "Map.");
7477 TEUCHOS_TEST_FOR_EXCEPTION
7478 (! domainMap.is_null() &&
7479 ! domainMap->isSameAs(*B_domainMap),
7480 std::invalid_argument,
7481 errPfx << "The input domain Map must be the same as "
7482 "(isSameAs) this RowMatrix's domain Map.");
7483 TEUCHOS_TEST_FOR_EXCEPTION
7484 (! rangeMap.is_null() &&
7485 ! rangeMap->isSameAs(*B_rangeMap),
7486 std::invalid_argument,
7487 errPfx << "The input range Map must be the same as "
7488 "(isSameAs) this RowMatrix's range Map.");
7489 }
7490 }
7491 else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7492 TEUCHOS_TEST_FOR_EXCEPTION
7493 (! domainMap.is_null() &&
7494 ! domainMap->isSameAs(*B_domainMap),
7495 std::invalid_argument,
7496 errPfx << "The input domain Map must be the same as "
7497 "(isSameAs) this RowMatrix's domain Map.");
7498 TEUCHOS_TEST_FOR_EXCEPTION
7499 (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7500 std::invalid_argument,
7501 errPfx << "The input range Map must be the same as "
7502 "(isSameAs) this RowMatrix's range Map.");
7503 }
7504 else {
7505 TEUCHOS_TEST_FOR_EXCEPTION
7506 (domainMap.is_null() || rangeMap.is_null(),
7507 std::invalid_argument, errPfx << "If neither A nor B "
7508 "have a domain and range Map, then you must supply a "
7509 "nonnull domain and range Map to this method.");
7510 }
7511 }
7512
7513 // What parameters do we pass to C's constructor? Do we call
7514 // fillComplete on C after filling it? And if so, what parameters
7515 // do we pass to C's fillComplete call?
7516 bool callFillComplete = true;
7517 RCP<ParameterList> constructorSublist;
7518 RCP<ParameterList> fillCompleteSublist;
7519 if (! params.is_null()) {
7520 callFillComplete =
7521 params->get("Call fillComplete", callFillComplete);
7522 constructorSublist = sublist(params, "Constructor parameters");
7523 fillCompleteSublist = sublist(params, "fillComplete parameters");
7524 }
7525
7526 RCP<const map_type> A_rowMap = A.getRowMap ();
7527 RCP<const map_type> B_rowMap = B.getRowMap ();
7528 RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7529 RCP<crs_matrix_type> C; // The result matrix.
7530
7531 // If A and B's row Maps are the same, we can compute an upper
7532 // bound on the number of entries in each row of C, before
7533 // actually computing the sum. A reasonable upper bound is the
7534 // sum of the two entry counts in each row.
7535 if (A_rowMap->isSameAs (*B_rowMap)) {
7536 const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7537 Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7538
7539 // Get the number of entries in each row of A.
7540 if (alpha != ZERO) {
7541 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7542 const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7543 C_maxNumEntriesPerRow[localRow] += A_numEntries;
7544 }
7545 }
7546 // Get the number of entries in each row of B.
7547 if (beta != ZERO) {
7548 for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7549 const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7550 C_maxNumEntriesPerRow[localRow] += B_numEntries;
7551 }
7552 }
7553 // Construct the result matrix C.
7554 if (constructorSublist.is_null ()) {
7555 C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7556 } else {
7557 C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7558 constructorSublist));
7559 }
7560 // Since A and B have the same row Maps, we could add them
7561 // together all at once and merge values before we call
7562 // insertGlobalValues. However, we don't really need to, since
7563 // we've already allocated enough space in each row of C for C
7564 // to do the merge itself.
7565 }
7566 else { // the row Maps of A and B are not the same
7567 // Construct the result matrix C.
7568 // true: !A_rowMap->isSameAs (*B_rowMap)
7569 TEUCHOS_TEST_FOR_EXCEPTION
7570 (true, std::invalid_argument, errPfx << "The row maps must "
7571 "be the same for statically allocated matrices, to ensure "
7572 "that there is sufficient space to do the addition.");
7573 }
7574
7575 TEUCHOS_TEST_FOR_EXCEPTION
7576 (C.is_null (), std::logic_error,
7577 errPfx << "C should not be null at this point. "
7578 "Please report this bug to the Tpetra developers.");
7579
7580 if (verbose) {
7581 std::ostringstream os;
7582 os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7583 std::cerr << os.str ();
7584 }
7585 using gids_type = nonconst_global_inds_host_view_type;
7586 using vals_type = nonconst_values_host_view_type;
7587 gids_type ind;
7588 vals_type val;
7589
7590 if (alpha != ZERO) {
7591 const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7592 for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7593 size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7594 const GO globalRow = A_rowMap->getGlobalElement (localRow);
7595 if (A_numEntries > static_cast<size_t> (ind.size ())) {
7596 Kokkos::resize(ind,A_numEntries);
7597 Kokkos::resize(val,A_numEntries);
7598 }
7599 gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7600 vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7601 A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7602
7603 if (alpha != ONE) {
7604 for (size_t k = 0; k < A_numEntries; ++k) {
7605 valView[k] *= alpha;
7606 }
7607 }
7608 C->insertGlobalValues (globalRow, A_numEntries,
7609 reinterpret_cast<Scalar *>(valView.data()),
7610 indView.data());
7611 }
7612 }
7613
7614 if (beta != ZERO) {
7615 const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7616 for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7617 size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7618 const GO globalRow = B_rowMap->getGlobalElement (localRow);
7619 if (B_numEntries > static_cast<size_t> (ind.size ())) {
7620 Kokkos::resize(ind,B_numEntries);
7621 Kokkos::resize(val,B_numEntries);
7622 }
7623 gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7624 vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7625 B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7626
7627 if (beta != ONE) {
7628 for (size_t k = 0; k < B_numEntries; ++k) {
7629 valView[k] *= beta;
7630 }
7631 }
7632 C->insertGlobalValues (globalRow, B_numEntries,
7633 reinterpret_cast<Scalar *>(valView.data()),
7634 indView.data());
7635 }
7636 }
7637
7638 if (callFillComplete) {
7639 if (verbose) {
7640 std::ostringstream os;
7641 os << *prefix << "Call fillComplete on C" << endl;
7642 std::cerr << os.str ();
7643 }
7644 if (fillCompleteSublist.is_null ()) {
7645 C->fillComplete (theDomainMap, theRangeMap);
7646 } else {
7647 C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7648 }
7649 }
7650 else if (verbose) {
7651 std::ostringstream os;
7652 os << *prefix << "Do NOT call fillComplete on C" << endl;
7653 std::cerr << os.str ();
7654 }
7655
7656 if (verbose) {
7657 std::ostringstream os;
7658 os << *prefix << "Done" << endl;
7659 std::cerr << os.str ();
7660 }
7661 return rcp_implicit_cast<row_matrix_type> (C);
7662 }
7663
7664
7665
7666 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7667 void
7670 const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7671 const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7672 const Teuchos::RCP<const map_type>& domainMap,
7673 const Teuchos::RCP<const map_type>& rangeMap,
7674 const Teuchos::RCP<Teuchos::ParameterList>& params) const
7675 {
7676 using Details::Behavior;
7677 using Details::getArrayViewFromDualView;
7678 using Details::packCrsMatrixWithOwningPIDs;
7679 using Details::unpackAndCombineWithOwningPIDsCount;
7680 using Details::unpackAndCombineIntoCrsArrays;
7681 using Teuchos::ArrayRCP;
7682 using Teuchos::ArrayView;
7683 using Teuchos::Comm;
7684 using Teuchos::ParameterList;
7685 using Teuchos::RCP;
7686 using std::endl;
7687 typedef LocalOrdinal LO;
7688 typedef GlobalOrdinal GO;
7689 typedef node_type NT;
7690 typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7691 typedef Vector<int, LO, GO, NT> IntVectorType;
7692 using Teuchos::as;
7693
7694 const bool debug = Behavior::debug("CrsMatrix");
7695 const bool verbose = Behavior::verbose("CrsMatrix");
7696 int MyPID = getComm ()->getRank ();
7697
7698 std::unique_ptr<std::string> verbosePrefix;
7699 if (verbose) {
7700 verbosePrefix =
7701 this->createPrefix("CrsMatrix", "transferAndFillComplete");
7702 std::ostringstream os;
7703 os << "Start" << endl;
7704 std::cerr << os.str();
7705 }
7706
7707 //
7708 // Get the caller's parameters
7709 //
7710 bool isMM = false; // optimize for matrix-matrix ops.
7711 bool reverseMode = false; // Are we in reverse mode?
7712 bool restrictComm = false; // Do we need to restrict the communicator?
7713
7714 int mm_optimization_core_count =
7715 Behavior::TAFC_OptimizationCoreCount();
7716 RCP<ParameterList> matrixparams; // parameters for the destination matrix
7717 bool overrideAllreduce = false;
7718 if (! params.is_null ()) {
7719 matrixparams = sublist (params, "CrsMatrix");
7720 reverseMode = params->get ("Reverse Mode", reverseMode);
7721 restrictComm = params->get ("Restrict Communicator", restrictComm);
7722 auto & slist = params->sublist("matrixmatrix: kernel params",false);
7723 isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7724 mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7725
7726 overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7727 if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7728 if(reverseMode) isMM = false;
7729 }
7730
7731 // Only used in the sparse matrix-matrix multiply (isMM) case.
7732 std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7733 int mismatch = 0;
7734 int reduced_mismatch = 0;
7735 if (isMM && !overrideAllreduce) {
7736
7737 // Test for pathological matrix transfer
7738 const bool source_vals = ! getGraph ()->getImporter ().is_null();
7739 const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7740 rowTransfer.getRemoteLIDs ().size() == 0);
7741 mismatch = (source_vals != target_vals) ? 1 : 0;
7742 iallreduceRequest =
7743 ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7744 Teuchos::REDUCE_MAX, * (getComm ()));
7745 }
7746
7747#ifdef HAVE_TPETRA_MMM_TIMINGS
7748 using Teuchos::TimeMonitor;
7749 std::string label;
7750 if(!params.is_null())
7751 label = params->get("Timer Label",label);
7752 std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7753 std::string tlstr;
7754 {
7755 std::ostringstream os;
7756 if(isMM) os<<":MMOpt";
7757 else os<<":MMLegacy";
7758 tlstr = os.str();
7759 }
7760
7761 Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7762#endif
7763
7764 // Make sure that the input argument rowTransfer is either an
7765 // Import or an Export. Import and Export are the only two
7766 // subclasses of Transfer that we defined, but users might
7767 // (unwisely, for now at least) decide to implement their own
7768 // subclasses. Exclude this possibility.
7769 const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7770 const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7771 TEUCHOS_TEST_FOR_EXCEPTION(
7772 xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7773 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7774 "argument must be either an Import or an Export, and its template "
7775 "parameters must match the corresponding template parameters of the "
7776 "CrsMatrix.");
7777
7778 // Make sure that the input argument domainTransfer is either an
7779 // Import or an Export. Import and Export are the only two
7780 // subclasses of Transfer that we defined, but users might
7781 // (unwisely, for now at least) decide to implement their own
7782 // subclasses. Exclude this possibility.
7783 Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7784 Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7785
7786 if(! domainTransfer.is_null()) {
7787 TEUCHOS_TEST_FOR_EXCEPTION(
7788 (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7789 "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7790 "argument must be either an Import or an Export, and its template "
7791 "parameters must match the corresponding template parameters of the "
7792 "CrsMatrix.");
7793
7794 TEUCHOS_TEST_FOR_EXCEPTION(
7795 ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7796 (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7797 ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7798 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7799 "arguments must be of the same type (either Import or Export).");
7800
7801 TEUCHOS_TEST_FOR_EXCEPTION(
7802 ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7803 (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7804 ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7805 "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7806 "arguments must be of the same type (either Import or Export).");
7807 } // domainTransfer != null
7808
7809
7810 // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7811 // if the source Map is not distributed but the target Map is?
7812 const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7813
7814 // Get the new domain and range Maps. We need some of them for
7815 // error checking, now that we have the reverseMode parameter.
7816 RCP<const map_type> MyRowMap = reverseMode ?
7817 rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7818 RCP<const map_type> MyColMap; // create this below
7819 RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7820 domainMap : getDomainMap ();
7821 RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7822 rangeMap : getRangeMap ();
7823 RCP<const map_type> BaseRowMap = MyRowMap;
7824 RCP<const map_type> BaseDomainMap = MyDomainMap;
7825
7826 // If the user gave us a nonnull destMat, then check whether it's
7827 // "pristine." That means that it has no entries.
7828 //
7829 // FIXME (mfh 15 May 2014) If this is not true on all processes,
7830 // then this exception test may hang. It would be better to
7831 // forward an error flag to the next communication phase.
7832 if (! destMat.is_null ()) {
7833 // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7834 // whether a graph or matrix has no entries on the calling
7835 // process, is that it is neither locally nor globally indexed.
7836 // This may change eventually with the Kokkos refactor version
7837 // of Tpetra, so it would be better just to check the quantity
7838 // of interest directly. Note that with the Kokkos refactor
7839 // version of Tpetra, asking for the total number of entries in
7840 // a graph or matrix that is not fill complete might require
7841 // computation (kernel launch), since it is not thread scalable
7842 // to update a count every time an entry is inserted.
7843 const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7844 ! destMat->getGraph ()->isGloballyIndexed ();
7845 TEUCHOS_TEST_FOR_EXCEPTION(
7846 ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7847 "transferAndFillComplete: The input argument 'destMat' is only allowed "
7848 "to be nonnull, if its graph is empty (neither locally nor globally "
7849 "indexed).");
7850 // FIXME (mfh 15 May 2014) At some point, we want to change
7851 // graphs and matrices so that their DistObject Map
7852 // (this->getMap()) may differ from their row Map. This will
7853 // make redistribution for 2-D distributions more efficient. I
7854 // hesitate to change this check, because I'm not sure how much
7855 // the code here depends on getMap() and getRowMap() being the
7856 // same.
7857 TEUCHOS_TEST_FOR_EXCEPTION(
7858 ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7859 "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7860 "input argument 'destMat' is not the same as the (row) Map specified "
7861 "by the input argument 'rowTransfer'.");
7862 TEUCHOS_TEST_FOR_EXCEPTION(
7863 ! destMat->checkSizes (*this), std::invalid_argument,
7864 "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7865 "destination matrix, but checkSizes() indicates that it is not a legal "
7866 "legal target for redistribution from the source matrix (*this). This "
7867 "may mean that they do not have the same dimensions.");
7868 }
7869
7870 // If forward mode (the default), then *this's (row) Map must be
7871 // the same as the source Map of the Transfer. If reverse mode,
7872 // then *this's (row) Map must be the same as the target Map of
7873 // the Transfer.
7874 //
7875 // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7876 // and matrices so that their DistObject Map (this->getMap()) may
7877 // differ from their row Map. This will make redistribution for
7878 // 2-D distributions more efficient. I hesitate to change this
7879 // check, because I'm not sure how much the code here depends on
7880 // getMap() and getRowMap() being the same.
7881 TEUCHOS_TEST_FOR_EXCEPTION(
7882 ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7883 std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7884 "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7885 TEUCHOS_TEST_FOR_EXCEPTION(
7886 ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7887 std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7888 "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7889
7890 // checks for domainTransfer
7891 TEUCHOS_TEST_FOR_EXCEPTION(
7892 ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7893 std::invalid_argument,
7894 "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7895 "argument must be the same as the rebalanced domain map 'domainMap'");
7896
7897 TEUCHOS_TEST_FOR_EXCEPTION(
7898 ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7899 std::invalid_argument,
7900 "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7901 "argument must be the same as the rebalanced domain map 'domainMap'");
7902
7903 // The basic algorithm here is:
7904 //
7905 // 1. Call the moral equivalent of "Distor.do" to handle the import.
7906 // 2. Copy all the Imported and Copy/Permuted data into the raw
7907 // CrsMatrix / CrsGraphData pointers, still using GIDs.
7908 // 3. Call an optimized version of MakeColMap that avoids the
7909 // Directory lookups (since the importer knows who owns all the
7910 // GIDs) AND reindexes to LIDs.
7911 // 4. Call expertStaticFillComplete()
7912
7913 // Get information from the Importer
7914 const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7915 ArrayView<const LO> ExportLIDs = reverseMode ?
7916 rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7917 ArrayView<const LO> RemoteLIDs = reverseMode ?
7918 rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
7919 ArrayView<const LO> PermuteToLIDs = reverseMode ?
7920 rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
7921 ArrayView<const LO> PermuteFromLIDs = reverseMode ?
7922 rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
7923 Distributor& Distor = rowTransfer.getDistributor ();
7924
7925 // Owning PIDs
7926 Teuchos::Array<int> SourcePids;
7927 Teuchos::Array<int> TargetPids;
7928
7929 // Temp variables for sub-communicators
7930 RCP<const map_type> ReducedRowMap, ReducedColMap,
7931 ReducedDomainMap, ReducedRangeMap;
7932 RCP<const Comm<int> > ReducedComm;
7933
7934 // If the user gave us a null destMat, then construct the new
7935 // destination matrix. We will replace its column Map later.
7936 if (destMat.is_null ()) {
7937 destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
7938 }
7939
7940 /***************************************************/
7941 /***** 1) First communicator restriction phase ****/
7942 /***************************************************/
7943 if (restrictComm) {
7944 ReducedRowMap = MyRowMap->removeEmptyProcesses ();
7945 ReducedComm = ReducedRowMap.is_null () ?
7946 Teuchos::null :
7947 ReducedRowMap->getComm ();
7948 destMat->removeEmptyProcessesInPlace (ReducedRowMap);
7949
7950 ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
7951 ReducedRowMap :
7952 MyDomainMap->replaceCommWithSubset (ReducedComm);
7953 ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
7954 ReducedRowMap :
7955 MyRangeMap->replaceCommWithSubset (ReducedComm);
7956
7957 // Reset the "my" maps
7958 MyRowMap = ReducedRowMap;
7959 MyDomainMap = ReducedDomainMap;
7960 MyRangeMap = ReducedRangeMap;
7961
7962 // Update my PID, if we've restricted the communicator
7963 if (! ReducedComm.is_null ()) {
7964 MyPID = ReducedComm->getRank ();
7965 }
7966 else {
7967 MyPID = -2; // For debugging
7968 }
7969 }
7970 else {
7971 ReducedComm = MyRowMap->getComm ();
7972 }
7973
7974
7975
7976 /***************************************************/
7977 /***** 2) From Tpetra::DistObject::doTransfer() ****/
7978 /***************************************************/
7979 // Get the owning PIDs
7980 RCP<const import_type> MyImporter = getGraph ()->getImporter ();
7981
7982 // check whether domain maps of source matrix and base domain map is the same
7983 bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
7984
7985 if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
7986 // Same domain map as source matrix
7987 //
7988 // NOTE: This won't work for restrictComm (because the Import
7989 // doesn't know the restricted PIDs), though writing an
7990 // optimized version for that case would be easy (Import an
7991 // IntVector of the new PIDs). Might want to add this later.
7992 Import_Util::getPids (*MyImporter, SourcePids, false);
7993 }
7994 else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
7995 // Same domain map as source matrix (restricted communicator)
7996 // We need one import from the domain to the column map
7997 IntVectorType SourceDomain_pids(getDomainMap (),true);
7998 IntVectorType SourceCol_pids(getColMap());
7999 // SourceDomain_pids contains the restricted pids
8000 SourceDomain_pids.putScalar(MyPID);
8001
8002 SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8003 SourcePids.resize (getColMap ()->getLocalNumElements ());
8004 SourceCol_pids.get1dCopy (SourcePids ());
8005 }
8006 else if (MyImporter.is_null ()) {
8007 // Matrix has no off-process entries
8008 SourcePids.resize (getColMap ()->getLocalNumElements ());
8009 SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8010 }
8011 else if ( ! MyImporter.is_null () &&
8012 ! domainTransfer.is_null () ) {
8013 // general implementation for rectangular matrices with
8014 // domain map different than SourceMatrix domain map.
8015 // User has to provide a DomainTransfer object. We need
8016 // two communications (import/export)
8017
8018 // TargetDomain_pids lives on the rebalanced new domain map
8019 IntVectorType TargetDomain_pids (domainMap);
8020 TargetDomain_pids.putScalar (MyPID);
8021
8022 // SourceDomain_pids lives on the non-rebalanced old domain map
8023 IntVectorType SourceDomain_pids (getDomainMap ());
8024
8025 // SourceCol_pids lives on the non-rebalanced old column map
8026 IntVectorType SourceCol_pids (getColMap ());
8027
8028 if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8029 SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8030 }
8031 else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8032 SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8033 }
8034 else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8035 SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8036 }
8037 else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8038 SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8039 }
8040 else {
8041 TEUCHOS_TEST_FOR_EXCEPTION(
8042 true, std::logic_error, "Tpetra::CrsMatrix::"
8043 "transferAndFillComplete: Should never get here! "
8044 "Please report this bug to a Tpetra developer.");
8045 }
8046 SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8047 SourcePids.resize (getColMap ()->getLocalNumElements ());
8048 SourceCol_pids.get1dCopy (SourcePids ());
8049 }
8050 else if ( ! MyImporter.is_null () &&
8051 BaseDomainMap->isSameAs (*BaseRowMap) &&
8052 getDomainMap ()->isSameAs (*getRowMap ())) {
8053 // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8054
8055 IntVectorType TargetRow_pids (domainMap);
8056 IntVectorType SourceRow_pids (getRowMap ());
8057 IntVectorType SourceCol_pids (getColMap ());
8058
8059 TargetRow_pids.putScalar (MyPID);
8060 if (! reverseMode && xferAsImport != nullptr) {
8061 SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8062 }
8063 else if (reverseMode && xferAsExport != nullptr) {
8064 SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8065 }
8066 else if (! reverseMode && xferAsExport != nullptr) {
8067 SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8068 }
8069 else if (reverseMode && xferAsImport != nullptr) {
8070 SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8071 }
8072 else {
8073 TEUCHOS_TEST_FOR_EXCEPTION(
8074 true, std::logic_error, "Tpetra::CrsMatrix::"
8075 "transferAndFillComplete: Should never get here! "
8076 "Please report this bug to a Tpetra developer.");
8077 }
8078
8079 SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8080 SourcePids.resize (getColMap ()->getLocalNumElements ());
8081 SourceCol_pids.get1dCopy (SourcePids ());
8082 }
8083 else {
8084 TEUCHOS_TEST_FOR_EXCEPTION(
8085 true, std::invalid_argument, "Tpetra::CrsMatrix::"
8086 "transferAndFillComplete: This method only allows either domainMap == "
8087 "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8088 "getDomainMap () == getRowMap ()).");
8089 }
8090
8091 // Tpetra-specific stuff
8092 size_t constantNumPackets = destMat->constantNumberOfPackets ();
8093 if (constantNumPackets == 0) {
8094 destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8095 RemoteLIDs.size ());
8096 }
8097 else {
8098 // There are a constant number of packets per element. We
8099 // already know (from the number of "remote" (incoming)
8100 // elements) how many incoming elements we expect, so we can
8101 // resize the buffer accordingly.
8102 const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
8103 destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8104 }
8105
8106 // Pack & Prepare w/ owning PIDs
8107 if (debug) {
8108 using Teuchos::outArg;
8109 using Teuchos::REDUCE_MAX;
8110 using Teuchos::reduceAll;
8111 using std::cerr;
8112 using std::endl;
8113 RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8114 const int myRank = comm->getRank ();
8115
8116 std::ostringstream errStrm;
8117 int lclErr = 0;
8118 int gblErr = 0;
8119
8120 Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8121 try {
8122 // packAndPrepare* methods modify numExportPacketsPerLID_.
8123 destMat->numExportPacketsPerLID_.modify_host ();
8124 numExportPacketsPerLID =
8125 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8126 }
8127 catch (std::exception& e) {
8128 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8129 << e.what () << std::endl;
8130 lclErr = 1;
8131 }
8132 catch (...) {
8133 errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8134 "an exception not a subclass of std::exception" << std::endl;
8135 lclErr = 1;
8136 }
8137
8138 if (! comm.is_null ()) {
8139 reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8140 }
8141 if (gblErr != 0) {
8142 ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8143 TEUCHOS_TEST_FOR_EXCEPTION(
8144 true, std::runtime_error, "getArrayViewFromDualView threw an "
8145 "exception on at least one process.");
8146 }
8147
8148 if (verbose) {
8149 std::ostringstream os;
8150 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8151 << std::endl;
8152 std::cerr << os.str ();
8153 }
8154 try {
8156 destMat->exports_,
8157 numExportPacketsPerLID,
8158 ExportLIDs,
8159 SourcePids,
8160 constantNumPackets);
8161 }
8162 catch (std::exception& e) {
8163 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8164 << e.what () << std::endl;
8165 lclErr = 1;
8166 }
8167 catch (...) {
8168 errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8169 "an exception not a subclass of std::exception" << std::endl;
8170 lclErr = 1;
8171 }
8172
8173 if (verbose) {
8174 std::ostringstream os;
8175 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8176 << std::endl;
8177 std::cerr << os.str ();
8178 }
8179
8180 if (! comm.is_null ()) {
8181 reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8182 }
8183 if (gblErr != 0) {
8184 ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8185 TEUCHOS_TEST_FOR_EXCEPTION(
8186 true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8187 "exception on at least one process.");
8188 }
8189 }
8190 else {
8191 // packAndPrepare* methods modify numExportPacketsPerLID_.
8192 destMat->numExportPacketsPerLID_.modify_host ();
8193 Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8194 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8195 if (verbose) {
8196 std::ostringstream os;
8197 os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8198 << std::endl;
8199 std::cerr << os.str ();
8200 }
8202 destMat->exports_,
8203 numExportPacketsPerLID,
8204 ExportLIDs,
8205 SourcePids,
8206 constantNumPackets);
8207 if (verbose) {
8208 std::ostringstream os;
8209 os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8210 << std::endl;
8211 std::cerr << os.str ();
8212 }
8213 }
8214
8215 // Do the exchange of remote data.
8216 if (! communication_needed) {
8217 if (verbose) {
8218 std::ostringstream os;
8219 os << *verbosePrefix << "Communication not needed" << std::endl;
8220 std::cerr << os.str ();
8221 }
8222 }
8223 else {
8224 if (reverseMode) {
8225 if (constantNumPackets == 0) { // variable number of packets per LID
8226 if (verbose) {
8227 std::ostringstream os;
8228 os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8229 << std::endl;
8230 std::cerr << os.str ();
8231 }
8232 // Make sure that host has the latest version, since we're
8233 // using the version on host. If host has the latest
8234 // version, syncing to host does nothing.
8235 destMat->numExportPacketsPerLID_.sync_host ();
8236 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8237 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8238 destMat->numImportPacketsPerLID_.sync_host ();
8239 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8240 getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8241
8242 if (verbose) {
8243 std::ostringstream os;
8244 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8245 << std::endl;
8246 std::cerr << os.str ();
8247 }
8248 Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8249 destMat->numImportPacketsPerLID_.view_host());
8250 if (verbose) {
8251 std::ostringstream os;
8252 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8253 << std::endl;
8254 std::cerr << os.str ();
8255 }
8256
8257 size_t totalImportPackets = 0;
8258 for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8259 totalImportPackets += numImportPacketsPerLID[i];
8260 }
8261
8262 // Reallocation MUST go before setting the modified flag,
8263 // because it may clear out the flags.
8264 destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8265 verbosePrefix.get ());
8266 destMat->imports_.modify_host ();
8267 auto hostImports = destMat->imports_.view_host();
8268 // This is a legacy host pack/unpack path, so use the host
8269 // version of exports_.
8270 destMat->exports_.sync_host ();
8271 auto hostExports = destMat->exports_.view_host();
8272 if (verbose) {
8273 std::ostringstream os;
8274 os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8275 << std::endl;
8276 std::cerr << os.str ();
8277 }
8278 Distor.doReversePostsAndWaits (hostExports,
8279 numExportPacketsPerLID,
8280 hostImports,
8281 numImportPacketsPerLID);
8282 if (verbose) {
8283 std::ostringstream os;
8284 os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8285 << std::endl;
8286 std::cerr << os.str ();
8287 }
8288 }
8289 else { // constant number of packets per LID
8290 if (verbose) {
8291 std::ostringstream os;
8292 os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8293 << std::endl;
8294 std::cerr << os.str ();
8295 }
8296 destMat->imports_.modify_host ();
8297 auto hostImports = destMat->imports_.view_host();
8298 // This is a legacy host pack/unpack path, so use the host
8299 // version of exports_.
8300 destMat->exports_.sync_host ();
8301 auto hostExports = destMat->exports_.view_host();
8302 if (verbose) {
8303 std::ostringstream os;
8304 os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8305 << std::endl;
8306 std::cerr << os.str ();
8307 }
8308 Distor.doReversePostsAndWaits (hostExports,
8309 constantNumPackets,
8310 hostImports);
8311 if (verbose) {
8312 std::ostringstream os;
8313 os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8314 << std::endl;
8315 std::cerr << os.str ();
8316 }
8317 }
8318 }
8319 else { // forward mode (the default)
8320 if (constantNumPackets == 0) { // variable number of packets per LID
8321 if (verbose) {
8322 std::ostringstream os;
8323 os << *verbosePrefix << "Forward mode, variable # packets / LID"
8324 << std::endl;
8325 std::cerr << os.str ();
8326 }
8327 // Make sure that host has the latest version, since we're
8328 // using the version on host. If host has the latest
8329 // version, syncing to host does nothing.
8330 destMat->numExportPacketsPerLID_.sync_host ();
8331 Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8332 getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8333 destMat->numImportPacketsPerLID_.sync_host ();
8334 Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8335 getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8336 if (verbose) {
8337 std::ostringstream os;
8338 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8339 << std::endl;
8340 std::cerr << os.str ();
8341 }
8342 Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8343 destMat->numImportPacketsPerLID_.view_host());
8344 if (verbose) {
8345 std::ostringstream os;
8346 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8347 << std::endl;
8348 std::cerr << os.str ();
8349 }
8350
8351 size_t totalImportPackets = 0;
8352 for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8353 totalImportPackets += numImportPacketsPerLID[i];
8354 }
8355
8356 // Reallocation MUST go before setting the modified flag,
8357 // because it may clear out the flags.
8358 destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8359 verbosePrefix.get ());
8360 destMat->imports_.modify_host ();
8361 auto hostImports = destMat->imports_.view_host();
8362 // This is a legacy host pack/unpack path, so use the host
8363 // version of exports_.
8364 destMat->exports_.sync_host ();
8365 auto hostExports = destMat->exports_.view_host();
8366 if (verbose) {
8367 std::ostringstream os;
8368 os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8369 << std::endl;
8370 std::cerr << os.str ();
8371 }
8372 Distor.doPostsAndWaits (hostExports,
8373 numExportPacketsPerLID,
8374 hostImports,
8375 numImportPacketsPerLID);
8376 if (verbose) {
8377 std::ostringstream os;
8378 os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8379 << std::endl;
8380 std::cerr << os.str ();
8381 }
8382 }
8383 else { // constant number of packets per LID
8384 if (verbose) {
8385 std::ostringstream os;
8386 os << *verbosePrefix << "Forward mode, constant # packets / LID"
8387 << std::endl;
8388 std::cerr << os.str ();
8389 }
8390 destMat->imports_.modify_host ();
8391 auto hostImports = destMat->imports_.view_host();
8392 // This is a legacy host pack/unpack path, so use the host
8393 // version of exports_.
8394 destMat->exports_.sync_host ();
8395 auto hostExports = destMat->exports_.view_host();
8396 if (verbose) {
8397 std::ostringstream os;
8398 os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8399 << std::endl;
8400 std::cerr << os.str ();
8401 }
8402 Distor.doPostsAndWaits (hostExports,
8403 constantNumPackets,
8404 hostImports);
8405 if (verbose) {
8406 std::ostringstream os;
8407 os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8408 << std::endl;
8409 std::cerr << os.str ();
8410 }
8411 }
8412 }
8413 }
8414
8415 /*********************************************************************/
8416 /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8417 /*********************************************************************/
8418
8419 // Backwards compatibility measure. We'll use this again below.
8420 destMat->numImportPacketsPerLID_.sync_host ();
8421 Teuchos::ArrayView<const size_t> numImportPacketsPerLID =
8422 getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8423 destMat->imports_.sync_host ();
8424 Teuchos::ArrayView<const char> hostImports =
8425 getArrayViewFromDualView (destMat->imports_);
8426
8427 if (verbose) {
8428 std::ostringstream os;
8429 os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount"
8430 << std::endl;
8431 std::cerr << os.str ();
8432 }
8433 size_t mynnz =
8435 RemoteLIDs,
8436 hostImports,
8437 numImportPacketsPerLID,
8438 constantNumPackets,
8439 INSERT,
8440 NumSameIDs,
8441 PermuteToLIDs,
8442 PermuteFromLIDs);
8443 if (verbose) {
8444 std::ostringstream os;
8445 os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned "
8446 << mynnz << std::endl;
8447 std::cerr << os.str ();
8448 }
8449 size_t N = BaseRowMap->getLocalNumElements ();
8450
8451 // Allocations
8452 ArrayRCP<size_t> CSR_rowptr(N+1);
8453 ArrayRCP<GO> CSR_colind_GID;
8454 ArrayRCP<LO> CSR_colind_LID;
8455 ArrayRCP<Scalar> CSR_vals;
8456 CSR_colind_GID.resize (mynnz);
8457 CSR_vals.resize (mynnz);
8458
8459 // If LO and GO are the same, we can reuse memory when
8460 // converting the column indices from global to local indices.
8461 if (typeid (LO) == typeid (GO)) {
8462 CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8463 }
8464 else {
8465 CSR_colind_LID.resize (mynnz);
8466 }
8467
8468 if (verbose) {
8469 std::ostringstream os;
8470 os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays"
8471 << std::endl;
8472 std::cerr << os.str ();
8473 }
8474 // FIXME (mfh 15 May 2014) Why can't we abstract this out as an
8475 // unpackAndCombine method on a "CrsArrays" object? This passing
8476 // in a huge list of arrays is icky. Can't we have a bit of an
8477 // abstraction? Implementing a concrete DistObject subclass only
8478 // takes five methods.
8480 RemoteLIDs,
8481 hostImports,
8482 numImportPacketsPerLID,
8483 constantNumPackets,
8484 INSERT,
8485 NumSameIDs,
8486 PermuteToLIDs,
8487 PermuteFromLIDs,
8488 N,
8489 mynnz,
8490 MyPID,
8491 CSR_rowptr (),
8492 CSR_colind_GID (),
8493 Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
8494 SourcePids (),
8495 TargetPids);
8496
8497 // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8498 // owned entries. Convert them to the actual PID.
8499 for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8500 {
8501 if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8502 }
8503 /**************************************************************/
8504 /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8505 /**************************************************************/
8506 // Call an optimized version of makeColMap that avoids the
8507 // Directory lookups (since the Import object knows who owns all
8508 // the GIDs).
8509 Teuchos::Array<int> RemotePids;
8510 if (verbose) {
8511 std::ostringstream os;
8512 os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8513 << std::endl;
8514 std::cerr << os.str ();
8515 }
8516 Import_Util::lowCommunicationMakeColMapAndReindex (CSR_rowptr (),
8517 CSR_colind_LID (),
8518 CSR_colind_GID (),
8519 BaseDomainMap,
8520 TargetPids,
8521 RemotePids,
8522 MyColMap);
8523
8524 if (verbose) {
8525 std::ostringstream os;
8526 os << *verbosePrefix << "restrictComm="
8527 << (restrictComm ? "true" : "false") << std::endl;
8528 std::cerr << os.str ();
8529 }
8530
8531 /*******************************************************/
8532 /**** 4) Second communicator restriction phase ****/
8533 /*******************************************************/
8534 if (restrictComm) {
8535 ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8536 ReducedRowMap :
8537 MyColMap->replaceCommWithSubset (ReducedComm);
8538 MyColMap = ReducedColMap; // Reset the "my" maps
8539 }
8540
8541 // Replace the col map
8542 if (verbose) {
8543 std::ostringstream os;
8544 os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8545 std::cerr << os.str ();
8546 }
8547 destMat->replaceColMap (MyColMap);
8548
8549 // Short circuit if the processor is no longer in the communicator
8550 //
8551 // NOTE: Epetra modifies all "removed" processes so they
8552 // have a dummy (serial) Map that doesn't touch the original
8553 // communicator. Duplicating that here might be a good idea.
8554 if (ReducedComm.is_null ()) {
8555 if (verbose) {
8556 std::ostringstream os;
8557 os << *verbosePrefix << "I am no longer in the communicator; "
8558 "returning" << std::endl;
8559 std::cerr << os.str ();
8560 }
8561 return;
8562 }
8563
8564 /***************************************************/
8565 /**** 5) Sort ****/
8566 /***************************************************/
8567 if ((! reverseMode && xferAsImport != nullptr) ||
8568 (reverseMode && xferAsExport != nullptr)) {
8569 if (verbose) {
8570 std::ostringstream os;
8571 os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8572 std::cerr << os.str ();
8573 }
8574 Import_Util::sortCrsEntries (CSR_rowptr (),
8575 CSR_colind_LID (),
8576 CSR_vals ());
8577 }
8578 else if ((! reverseMode && xferAsExport != nullptr) ||
8579 (reverseMode && xferAsImport != nullptr)) {
8580 if (verbose) {
8581 std::ostringstream os;
8582 os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8583 << endl;
8584 std::cerr << os.str();
8585 }
8586 Import_Util::sortAndMergeCrsEntries (CSR_rowptr (),
8587 CSR_colind_LID (),
8588 CSR_vals ());
8589 if (CSR_rowptr[N] != mynnz) {
8590 CSR_colind_LID.resize (CSR_rowptr[N]);
8591 CSR_vals.resize (CSR_rowptr[N]);
8592 }
8593 }
8594 else {
8595 TEUCHOS_TEST_FOR_EXCEPTION(
8596 true, std::logic_error, "Tpetra::CrsMatrix::"
8597 "transferAndFillComplete: Should never get here! "
8598 "Please report this bug to a Tpetra developer.");
8599 }
8600 /***************************************************/
8601 /**** 6) Reset the colmap and the arrays ****/
8602 /***************************************************/
8603
8604 if (verbose) {
8605 std::ostringstream os;
8606 os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8607 std::cerr << os.str ();
8608 }
8609
8610 // Call constructor for the new matrix (restricted as needed)
8611 //
8612 // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8613 // refactor version of CrsMatrix, though it reserves the right to
8614 // make a deep copy of the arrays.
8615 destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8616
8617 /***************************************************/
8618 /**** 7) Build Importer & Call ESFC ****/
8619 /***************************************************/
8620 // Pre-build the importer using the existing PIDs
8621 Teuchos::ParameterList esfc_params;
8622
8623 RCP<import_type> MyImport;
8624
8625 // Fulfill the non-blocking allreduce on reduced_mismatch.
8626 if (iallreduceRequest.get () != nullptr) {
8627 if (verbose) {
8628 std::ostringstream os;
8629 os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8630 << endl;
8631 std::cerr << os.str ();
8632 }
8633 iallreduceRequest->wait ();
8634 if (reduced_mismatch != 0) {
8635 isMM = false;
8636 }
8637 }
8638
8639 if( isMM ) {
8640#ifdef HAVE_TPETRA_MMM_TIMINGS
8641 Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8642#endif
8643 // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8644
8645 if (verbose) {
8646 std::ostringstream os;
8647 os << *verbosePrefix << "Getting CRS pointers" << endl;
8648 std::cerr << os.str ();
8649 }
8650
8651 Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8652 Teuchos::ArrayRCP<int> type3PIDs;
8653 auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8654 auto colind = getCrsGraph()->getLocalIndicesHost();
8655
8656 if (verbose) {
8657 std::ostringstream os;
8658 os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8659 std::cerr << os.str ();
8660 }
8661
8662 {
8663#ifdef HAVE_TPETRA_MMM_TIMINGS
8664 TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8665#endif
8666 Import_Util::reverseNeighborDiscovery(*this,
8667 rowptr,
8668 colind,
8669 rowTransfer,
8670 MyImporter,
8671 MyDomainMap,
8672 type3PIDs,
8673 type3LIDs,
8674 ReducedComm);
8675 }
8676
8677 if (verbose) {
8678 std::ostringstream os;
8679 os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8680 std::cerr << os.str ();
8681 }
8682
8683 Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8684 Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8685
8686 Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8687 Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8688
8689 const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8690 // from EpetraExt_MMHelpers.cpp: build_type2_exports
8691 std::vector<bool> IsOwned(numCols,true);
8692 std::vector<int> SentTo(numCols,-1);
8693 if (! MyImporter.is_null ()) {
8694 for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8695 IsOwned[rlid]=false;
8696 }
8697 }
8698
8699 std::vector<std::pair<int,GO> > usrtg;
8700 usrtg.reserve(TEPID2.size());
8701
8702 {
8703 const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8704 for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8705 const LO row = TELID2[i];
8706 const int pid = TEPID2[i];
8707 for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8708 const int col = colind[j];
8709 if (IsOwned[col] && SentTo[col] != pid) {
8710 SentTo[col] = pid;
8711 GO gid = colMap.getGlobalElement (col);
8712 usrtg.push_back (std::pair<int,GO> (pid, gid));
8713 }
8714 }
8715 }
8716 }
8717
8718// This sort can _not_ be omitted.
8719 std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
8720 auto eopg = std ::unique(usrtg.begin(),usrtg.end());
8721 // 25 Jul 2018: Could just ignore the entries at and after eopg.
8722 usrtg.erase(eopg,usrtg.end());
8723
8724 const Array_size_type type2_us_size = usrtg.size();
8725 Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
8726 Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
8727
8728 int pos=0;
8729 for(auto && p : usrtg) {
8730 EPID2[pos]= p.first;
8731 ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
8732 pos++;
8733 }
8734
8735 Teuchos::ArrayView<int> EPID3 = type3PIDs();
8736 Teuchos::ArrayView< LO> ELID3 = type3LIDs();
8737 GO InfGID = std::numeric_limits<GO>::max();
8738 int InfPID = INT_MAX;
8739#ifdef TPETRA_MIN3
8740# undef TPETRA_MIN3
8741#endif // TPETRA_MIN3
8742#define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
8743 int i1=0, i2=0, i3=0;
8744 int Len1 = EPID1.size();
8745 int Len2 = EPID2.size();
8746 int Len3 = EPID3.size();
8747
8748 int MyLen=Len1+Len2+Len3;
8749 Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
8750 Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
8751 int iloc = 0; // will be the size of the userExportLID/PIDs
8752
8753 while(i1 < Len1 || i2 < Len2 || i3 < Len3){
8754 int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
8755 int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
8756 int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
8757
8758 GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
8759 GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
8760 GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
8761
8762 int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
8763 GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
8764#ifdef TPETRA_MIN3
8765# undef TPETRA_MIN3
8766#endif // TPETRA_MIN3
8767 bool added_entry=false;
8768
8769 if(PID1 == MIN_PID && GID1 == MIN_GID){
8770 userExportLIDs[iloc]=ELID1[i1];
8771 userExportPIDs[iloc]=EPID1[i1];
8772 i1++;
8773 added_entry=true;
8774 iloc++;
8775 }
8776 if(PID2 == MIN_PID && GID2 == MIN_GID){
8777 if(!added_entry) {
8778 userExportLIDs[iloc]=ELID2[i2];
8779 userExportPIDs[iloc]=EPID2[i2];
8780 added_entry=true;
8781 iloc++;
8782 }
8783 i2++;
8784 }
8785 if(PID3 == MIN_PID && GID3 == MIN_GID){
8786 if(!added_entry) {
8787 userExportLIDs[iloc]=ELID3[i3];
8788 userExportPIDs[iloc]=EPID3[i3];
8789 iloc++;
8790 }
8791 i3++;
8792 }
8793 }
8794
8795 if (verbose) {
8796 std::ostringstream os;
8797 os << *verbosePrefix << "Create Import" << std::endl;
8798 std::cerr << os.str ();
8799 }
8800
8801#ifdef HAVE_TPETRA_MMM_TIMINGS
8802 auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
8803#endif
8804 Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
8805 // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
8806 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8807 MyImport = rcp ( new import_type (MyDomainMap,
8808 MyColMap,
8809 RemotePids,
8810 userExportLIDs.view(0,iloc).getConst(),
8811 userExportPIDs.view(0,iloc).getConst(),
8812 plist)
8813 );
8814
8815 if (verbose) {
8816 std::ostringstream os;
8817 os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
8818 std::cerr << os.str ();
8819 }
8820
8821 {
8822#ifdef HAVE_TPETRA_MMM_TIMINGS
8823 TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
8824 esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
8825#endif
8826 if(!params.is_null())
8827 esfc_params.set("compute global constants",params->get("compute global constants",true));
8828 destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
8829
8830 }
8831
8832 } // if(isMM)
8833 else {
8834#ifdef HAVE_TPETRA_MMM_TIMINGS
8835 TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
8836#endif
8837 if (verbose) {
8838 std::ostringstream os;
8839 os << *verbosePrefix << "Create Import" << std::endl;
8840 std::cerr << os.str ();
8841 }
8842
8843#ifdef HAVE_TPETRA_MMM_TIMINGS
8844 TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
8845#endif
8846 Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
8847 mypars->set("Timer Label","notMMFrom_tAFC");
8848 if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
8849 MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
8850
8851 if (verbose) {
8852 std::ostringstream os;
8853 os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
8854 std::cerr << os.str ();
8855 }
8856
8857#ifdef HAVE_TPETRA_MMM_TIMINGS
8858 TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
8859 esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
8860#else
8861 esfc_params.set("Timer Label",std::string("notMM eSFC"));
8862#endif
8863
8864 if (!params.is_null ()) {
8865 esfc_params.set ("compute global constants",
8866 params->get ("compute global constants", true));
8867 }
8868 destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
8869 MyImport, Teuchos::null,
8870 rcp (new Teuchos::ParameterList (esfc_params)));
8871 }
8872
8873 if (verbose) {
8874 std::ostringstream os;
8875 os << *verbosePrefix << "Done" << endl;
8876 std::cerr << os.str ();
8877 }
8878 }
8879
8880
// Overload taking a single Import (`importer`). The body only forwards:
// destMatrix, importer, domainMap, rangeMap and params are handed to
// transferAndFillComplete unchanged, and Teuchos::null is passed as the
// third argument.
// NOTE(review): the declarator line(s) carrying the method name and the
// `destMatrix` parameter are absent from this listing (the embedded
// numbering jumps from 8882 to 8885); presumably this is
// CrsMatrix::importAndFillComplete -- confirm against the class declaration.
8881 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8882 void
8885 const import_type& importer,
8886 const Teuchos::RCP<const map_type>& domainMap,
8887 const Teuchos::RCP<const map_type>& rangeMap,
8888 const Teuchos::RCP<Teuchos::ParameterList>& params) const
8889 {
8890 transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
8891 }
8892
// Overload taking two Imports (`rowImporter` and `domainImporter`). The body
// only forwards to transferAndFillComplete: rowImporter is passed as the
// transfer object, and domainImporter is wrapped with Teuchos::rcpFromRef
// and passed as the third argument. Because rcpFromRef wraps a reference,
// the caller's domainImporter must stay alive for the duration of the call.
// NOTE(review): the declarator line(s) carrying the method name and the
// `destMatrix` parameter are absent from this listing (the embedded
// numbering jumps from 8894 to 8897); presumably this is
// CrsMatrix::importAndFillComplete -- confirm against the class declaration.
8893 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8894 void
8897 const import_type& rowImporter,
8898 const import_type& domainImporter,
8899 const Teuchos::RCP<const map_type>& domainMap,
8900 const Teuchos::RCP<const map_type>& rangeMap,
8901 const Teuchos::RCP<Teuchos::ParameterList>& params) const
8902 {
8903 transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
8904 }
8905
// Overload taking a single Export (`exporter`). The body only forwards:
// destMatrix, exporter, domainMap, rangeMap and params are handed to
// transferAndFillComplete unchanged, and Teuchos::null is passed as the
// third argument.
// NOTE(review): the declarator line(s) carrying the method name and the
// `destMatrix` parameter are absent from this listing (the embedded
// numbering jumps from 8907 to 8910); presumably this is
// CrsMatrix::exportAndFillComplete -- confirm against the class declaration.
8906 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8907 void
8910 const export_type& exporter,
8911 const Teuchos::RCP<const map_type>& domainMap,
8912 const Teuchos::RCP<const map_type>& rangeMap,
8913 const Teuchos::RCP<Teuchos::ParameterList>& params) const
8914 {
8915 transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
8916 }
8917
// Overload taking two Exports (`rowExporter` and `domainExporter`). The body
// only forwards to transferAndFillComplete: rowExporter is passed as the
// transfer object, and domainExporter is wrapped with Teuchos::rcpFromRef
// and passed as the third argument. Because rcpFromRef wraps a reference,
// the caller's domainExporter must stay alive for the duration of the call.
// NOTE(review): the declarator line(s) carrying the method name and the
// `destMatrix` parameter are absent from this listing (the embedded
// numbering jumps from 8919 to 8922); presumably this is
// CrsMatrix::exportAndFillComplete -- confirm against the class declaration.
8918 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8919 void
8922 const export_type& rowExporter,
8923 const export_type& domainExporter,
8924 const Teuchos::RCP<const map_type>& domainMap,
8925 const Teuchos::RCP<const map_type>& rangeMap,
8926 const Teuchos::RCP<Teuchos::ParameterList>& params) const
8927 {
8928 transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
8929 }
8930
8931} // namespace Tpetra
8932
8933//
8934// Explicit instantiation macro
8935//
8936// Must be expanded from within the Tpetra namespace!
8937//
8938
// Expands to an explicit instantiation of the class template
// CrsMatrix<SCALAR, LO, GO, NODE>. The name CrsMatrix is used unqualified,
// so the macro has to be expanded where that name is visible.
8939#define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
8940 \
8941 template class CrsMatrix< SCALAR , LO , GO , NODE >;
8942
// Expands to an explicit instantiation of the member template
// CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>>. SI is the scalar type of the
// matrix the member belongs to; SO is the scalar type of the result.
8943#define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
8944 \
8945 template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
8946 CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
8947
// Expands to a `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form that takes a single Import (`importer`). The Import and Map template
// arguments are spelled through the matrix's own local_ordinal_type,
// global_ordinal_type and node_type typedefs rather than LO/GO/NODE directly.
// Parameters of the declared function: sourceMatrix, importer, domainMap,
// rangeMap, params; it returns Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
8948#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8949 template<> \
8950 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
8951 importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
8952 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8953 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8954 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
8955 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8956 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8957 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
8958 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8959 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8960 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
8961 const Teuchos::RCP<Teuchos::ParameterList>& params);
8962
// Expands to a `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form that takes two Imports (`rowImporter` and `domainImporter`).
// Parameters of the declared function: sourceMatrix, rowImporter,
// domainImporter, domainMap, rangeMap, params; it returns
// Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
8963#define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8964 template<> \
8965 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
8966 importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
8967 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8968 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8969 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
8970 const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8971 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8972 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
8973 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8974 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8975 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
8976 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8977 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8978 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
8979 const Teuchos::RCP<Teuchos::ParameterList>& params);
8980
8981
// Expands to a `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form that takes a single Export (`exporter`). The Export and Map template
// arguments are spelled through the matrix's own local_ordinal_type,
// global_ordinal_type and node_type typedefs rather than LO/GO/NODE directly.
// Parameters of the declared function: sourceMatrix, exporter, domainMap,
// rangeMap, params; it returns Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
8982#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
8983 template<> \
8984 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
8985 exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
8986 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8987 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8988 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
8989 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8990 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8991 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
8992 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
8993 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
8994 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
8995 const Teuchos::RCP<Teuchos::ParameterList>& params);
8996
// Expands to a `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form that takes two Exports (`rowExporter` and `domainExporter`).
// Parameters of the declared function: sourceMatrix, rowExporter,
// domainExporter, domainMap, rangeMap, params; it returns
// Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>.
8997#define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
8998 template<> \
8999 Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9000 exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9001 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9002 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9003 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9004 const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9005 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9006 CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9007 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9008 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9009 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9010 const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9011 CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9012 CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9013 const Teuchos::RCP<Teuchos::ParameterList>& params);
9014
9015
// Umbrella macro: for one (SCALAR, LO, GO, NODE) combination it expands the
// class instantiation macro followed by the four import/export
// "and fill complete" macros (single- and two-transfer forms).
// TPETRA_CRSMATRIX_CONVERT_INSTANT is not part of this expansion and has to
// be invoked separately where it is wanted.
9016#define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9017 TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9018 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9019 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9020 TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9021 TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9022
9023#endif // TPETRA_CRSMATRIX_DEF_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular,...
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular,...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration of a function that prints strings from each process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::iallreduce.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
KOKKOS_FUNCTION size_t packRow(const LocalMapType &col_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const InputLidsType &lids_in, const InputPidsType &pids_in, const size_t offset, const size_t num_ent, const bool pack_pids)
Packs a single row of the CrsGraph.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows. Valid when isLocallyIndexed is true. If OptimizedStorage...
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
local_inds_dualv_type::t_host getLocalIndsViewHostNonConst(const RowInfo &rowinfo)
Get a ReadWrite locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(m...
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
size_t getLocalNumRows() const override
Returns the number of graph rows owned on the calling node.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
typename Node::device_type device_type
The Kokkos device type.
bool fillComplete_
Whether the matrix is fill complete.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Scalar scalar_type
The type of each entry in the matrix.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra's behavior.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
bool isLocallyComplete() const
Is this Export or Import locally complete?
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
local_map_type getLocalMap() const
Get the local Map for Kokkos kernels.
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
size_t getLocalLength() const
Local number of rows on the calling process.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y....
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X....
A read-only, row-oriented interface to a sparse matrix.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
Implementation details of Tpetra.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember function that creates a one-to-one version of the given Map M, in which each global index is owned by exactly one process.
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Functor for the ABSMAX CombineMode of Import and Export operations.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
Traits class for packing / unpacking data of type T.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.