Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// ************************************************************************
//
// Intrepid2 Package
// Copyright (2007) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Kyungjoo Kim (kyukim@sandia.gov),
// Mauro Perego (mperego@sandia.gov), or
// Nate Roberts (nvrober@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.

      size_t fad_size_output_ = 0; // 0 if not a fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices with arguments prior to the startingComponent fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData,
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up relative enumeration spans: total number of enumeration indices with arguments prior to the startingComponent fixed.
        // These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
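
        // Worked example of the spans just computed (hypothetical bounds, for illustration):
        // with numTensorComponents_ = 3 and leftFieldBounds_ = {4,5,6}, the loop above yields
        //   leftFieldRelativeEnumerationSpans_[1] = 5        (component 1 free; component 2 is the fixed final component)
        //   leftFieldRelativeEnumerationSpans_[0] = 4*5 = 20
        // i.e., entry [s] counts the enumeration indices spanned by components s..R-1.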

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to the final integralView
      }
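
      // Worked example of the scratch layout above (hypothetical extents, for illustration):
      // for numTensorComponents_ = 3 with left and right field extents both {2,3,4}:
      //   offsetsForComponentOrdinal_[2] = 0   (a single entry for the final component)
      //   offsetsForComponentOrdinal_[1] = 1   (3*3 = 9 entries, one per (i_1,j_1) pair)
      //   offsetsForComponentOrdinal_[0] = 10  (total per-thread scratch; component 0 sums straight into integralView_)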

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
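
      // Example of the odometer-style increment above (illustrative): with bounds = {2,3},
      // arguments advances (0,0) -> (0,1) -> (0,2) -> (1,0) -> (1,1) -> (1,2) -> (0,0).
      // The return value is the least-significant rank incremented: 1 for most steps, 0 for
      // the (0,2) -> (1,0) step, and -1 once everything has wrapped back around to (0,0).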

      //! runtime-sized variant of incrementArgument; numComponents must not exceed Parameters::MaxTensorComponents
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }

      //! runtime-sized variant of nextIncrementResult: returns the rank that incrementArgument() would increment next, without modifying the arguments
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
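
      // Example (illustrative): for arguments = {i0,i1,i2}, bounds = {b0,b1,b2}, and
      // startIndex = 0, the loop above computes
      //   enumerationIndex = (i2 * b1 + i1) * b0 + i0,
      // matching TensorData's enumeration, with the first component fastest-moving.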

      // nvcc refuses to compile the below with error, "explicit specialization is not allowed in the current scope". Clang is OK with it. We just do a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0],  pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1],  pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2],  pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              const auto & composedTransformView = composedTransform_.getUnderlyingView4();
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (Note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then Gy entries.
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar & leftValue  = leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz
479
480 for (int iy=0; iy<leftFieldBounds_[1]; iy++)
481 {
482 leftFieldArguments[1] = iy;
483 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
484
485 const Scalar & leftValue = leftFields_y(iy,ly);
486
487 for (int jy=0; jy<rightFieldBounds_[1]; jy++)
488 {
489 rightFieldArguments[1] = jy;
490
491 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
492 const Scalar & rightValue = rightFields_y(jy,ly);
493
494 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
495 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
496
497 const int Gz_index = 0;
498 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
499
500 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
501
502 Gy += leftValue * Gz * rightValue;
503 // approximateFlopCount += 3; // 2 multiplies, 1 sum
504 }
505 }
506 } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get field indices into the output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " << leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
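
      // In effect, runSpecialized3() evaluates the sum-factorized contraction
      //   integral(i,j) += sum_{lx,ly,lz} L_x(ix,lx) L_y(iy,ly) L_z(iz,lz)
      //                                 * w(lx,ly,lz) * R_x(jx,lx) R_y(jy,ly) R_z(jz,lz)
      // by accumulating G_z over lz, then G_y over ly, then summing over lx, rather than
      // performing the full six-way contraction at every quadrature point.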

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in an effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
              pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
            });
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in the final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R   = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // we want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indices from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for the final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set leftEnumerationIndex_sp, rightEnumerationIndex_sp to 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to the next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }
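
      // Launch sketch (illustrative only; the variable names here are assumptions, not the
      // exact IntegrationTools::integrate() call site):
      //
      //   using Functor = Impl::F_Integrate<Scalar,DeviceType,3>;
      //   Functor functor(integralData, leftComponent, composedTransform, rightComponent,
      //                   cellMeasures, a_offset, b_offset, 0, 0, /*forceNonSpecialized*/ false);
      //   const int numCells = integralData.extent_int(0);
      //   Kokkos::TeamPolicy<ExecutionSpace> policy(numCells, Kokkos::AUTO());
      //   Kokkos::parallel_for(policy, functor); // Kokkos picks up team_shmem_size() below automatically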

      //! returns an approximate count of the floating-point operations performed for a single cell (used for performance assessment)
      int approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell; callers multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures_.numTensorComponents() - 1 multiplies, so the total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // (the "R" argument here is ignored by the methods that treat the truncated field; it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in the final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // we want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to the next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      //! returns the team size to use with the Kokkos::TeamPolicy, capped by the thread parallelism this functor expresses
      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      //! returns the size, in bytes, of the per-team shared memory this functor requires
      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as for fast access to the current cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
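
      // Worked sizing example (hypothetical extents, for illustration): for a POD Scalar of
      // double, three components with field extents {2,3,4} on both sides (so
      // offsetsForComponentOrdinal_[0] = 10 and maxFields = 4), maxPointCount_ = 5 with
      // 125 expanded points, and team_size = 32, the request is
      //   10*32 + 125 + 3*4*5 + 3*4*5 = 565 doubles,
      // which shmem_size() converts to bytes (plus any alignment padding Kokkos adds).
      // Kokkos calls this hook automatically when the functor is launched with a TeamPolicy.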
    };

    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      //! runtime-sized variant of incrementArgument; numComponents must not exceed Parameters::MaxTensorComponents
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      //! runtime-sized variant of nextIncrementResult: returns the rank that incrementArgument() would increment next, without modifying the arguments
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
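
      // The overloads above select the indexing at compile time: when integralViewRank == 3 the
      // integral view has shape (C,F1,F2); when it is 2 (single-cell data), shape (F1,F2).
      // enable_if_t removes the non-matching overload from overload resolution.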

      //! hand-specialized implementation for the three-component (3D) case
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        const int pointBounds_x = pointBounds_[0];
        const int pointBounds_y = pointBounds_[1];
        const int pointBounds_z = pointBounds_[2];
        const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;

        const int leftFieldBounds_x  = leftFieldBounds_[0];
        const int rightFieldBounds_x = rightFieldBounds_[0];
        const int leftFieldBounds_y  = leftFieldBounds_[1];
        const int rightFieldBounds_y = rightFieldBounds_[1];
        const int leftFieldBounds_z  = leftFieldBounds_[2];
        const int rightFieldBounds_z = rightFieldBounds_[2];

        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const auto integralView            = integralView_;
        const auto leftFieldOrdinalOffset  = leftFieldOrdinalOffset_;
        const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;

        const int cellDataOrdinal = teamMember.league_rank();
        const int threadNumber    = teamMember.team_rank();

        const int numThreads   = teamMember.team_size(); // num threads
        const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;  // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensions
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;  // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;   // for the one Gz value that we sum into before summing into the destination matrix
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by the team

1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1219 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1220 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1221 if (fad_size_output_ > 0) {
1222 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1223 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1224 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1225 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1226
1227 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1228 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1229 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1230 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1231 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1232 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1233 }
1234 else {
1235 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1236 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1237 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1238 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1239
1240 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1241 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1242 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1243 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1244 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1245 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1246 }
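      // Note: when Scalar is a Sacado Fad type, each value carries fad_size_output_ derivative
      // components, so the unmanaged scratch views above must be allocated with that extra
      // trailing dimension. As a concrete example, supposing pointsInNonzeroComponentDimensions = 16
      // and fad_size_output_ = 8, GxIntegrals holds 16 entries of 8 derivative components each;
      // omitting the trailing argument would under-allocate the team's scratch buffer.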
1247
1248// int approximateFlopCount = 0;
1249// int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1250
1251 // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1252
1253 // synchronize threads
1254 teamMember.team_barrier();
1255
1256 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1257 {
1258 const int a = a_offset_ + a_component;
1259 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1260 {
1261 const int b = b_offset_ + b_component;
1262
1263 const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1264 const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1265 const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1266 const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1267 const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1268 const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1269
1270 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1271 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1272 if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1273 {
1274 const int pointCount = leftTensorComponent_x.extent_int(1);
1275 const int leftRank = leftTensorComponent_x.rank();
1276 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1277 {
1278 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1279 }
1280 }
1281 if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1282 {
1283 const int pointCount = leftTensorComponent_y.extent_int(1);
1284 const int leftRank = leftTensorComponent_y.rank();
1285 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1286 {
1287 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288 }
1289 }
1290 if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1291 {
1292 const int pointCount = leftTensorComponent_z.extent_int(1);
1293 const int leftRank = leftTensorComponent_z.rank();
1294 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1295 {
1296 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297 }
1298 }
1299 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1300 {
1301 const int pointCount = rightTensorComponent_x.extent_int(1);
1302 const int rightRank = rightTensorComponent_x.rank();
1303 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1304 {
1305 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306 }
1307 }
1308 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1309 {
1310 const int pointCount = rightTensorComponent_y.extent_int(1);
1311 const int rightRank = rightTensorComponent_y.rank();
1312 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1313 {
1314 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315 }
1316 }
1317 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1318 {
1319 const int pointCount = rightTensorComponent_z.extent_int(1);
1320 const int rightRank = rightTensorComponent_z.rank();
1321 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1322 {
1323 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1324 }
1325 }
1326 });
1327
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330 });
1331
1332 // synchronize threads
1333 teamMember.team_barrier();
1334
1335 for (int i0=0; i0<leftFieldBounds_x; i0++)
1336 {
1337 for (int j0=0; j0<rightFieldBounds_x; j0++)
1338 {
1339 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1340 // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1341 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
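            // As a concrete example: with pointBounds_x = 3 and pointBounds_y = 4, the later-dimension
            // enumeration has y fastest, so pointEnumerationIndexLaterDimensions = 5 corresponds to
            // (y,z) = (1,1), and the x values for that (y,z) occupy pointWeights(15..17).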
1342
1343 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1344
1345 Gx = 0;
1346 if (fad_size_output_ == 0)
1347 {
1348                  // not a Fad type; we're allowed to use a vector range
1349 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1350 {
1351 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1352 }, Gx);
1353 }
1354 else
1355 {
1356 for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1357 {
1358 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359 }
1360 }
1361 });
1362
1363 // synchronize threads
1364 teamMember.team_barrier();
1365
1366 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1367 const int i1 = i1j1 % leftFieldBounds_y;
1368 const int j1 = i1j1 / leftFieldBounds_y;
1369
1370 int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1371
1372 int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1373 for (int lz=0; lz<pointBounds_z; lz++)
1374 {
1375 Scalar & Gy = GyIntegrals(Gy_index);
1376 Gy = 0.0;
1377
1378 for (int ly=0; ly<pointBounds_y; ly++)
1379 {
1380 const Scalar & leftValue = leftFields_y(i1,ly);
1381 const Scalar & rightValue = rightFields_y(j1,ly);
1382
1383 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1384
1385 pointEnumerationIndex++;
1386 }
1387 Gy_index++;
1388 }
1389
1390 Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
1391 for (int i2=0; i2<leftFieldBounds_z; i2++)
1392 {
1393 for (int j2=0; j2<rightFieldBounds_z; j2++)
1394 {
1395 Gz = 0.0;
1396
1397 int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1398
1399 for (int lz=0; lz<pointBounds_z; lz++)
1400 {
1401 const Scalar & leftValue = leftFields_z(i2,lz);
1402 const Scalar & rightValue = rightFields_z(j2,lz);
1403
1404 Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1405
1406 Gy_index++;
1407 }
1408
1409 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1410 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1411                // the lines above are an optimization of the commented-out computation below, introduced to work around an Intel compiler segfault (possibly a compiler bug).
1412// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1413// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1414
1415 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1416 }
1417 }
1418 });
1419 // synchronize threads
1420 teamMember.team_barrier();
1421 }
1422 }
1423 }
1424 }
1425 }
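    // In outline, runSpecialized3() above performs the standard 3D sum factorization:
    //   Gx(y,z) = sum_x leftFields_x(i0,x) * rightFields_x(j0,x) * pointWeights(x,y,z)   for each (i0,j0)
    //   Gy(z)   = sum_y leftFields_y(i1,y) * rightFields_y(j1,y) * Gx(y,z)               for each (i1,j1)
    //   Gz      = sum_z leftFields_z(i2,z) * rightFields_z(j2,z) * Gy(z)                 for each (i2,j2)
    // with each Gz accumulated into integralView(cell,i,j). Supposing f fields and p points per
    // dimension, this costs O(f^2 p^3 + f^4 p^2 + f^6 p) per cell -- the (p+1)^7 algorithm
    // mentioned below -- versus O(f^6 p^3), i.e. (p+1)^9, for a direct contraction.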
1426
1427 template<size_t numTensorComponents>
1428 KOKKOS_INLINE_FUNCTION
1429 void run( const TeamMember & teamMember ) const
1430 {
1431 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1432// Kokkos::Array<int,numTensorComponents> pointBounds;
1433// Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1434// Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1435//
1436// int pointsInNonzeroComponentDimensions = 1;
1437// for (unsigned r=0; r<numTensorComponents; r++)
1438// {
1439// pointBounds[r] = pointBounds_[r];
1440// if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1441// leftFieldBounds[r] = leftFieldBounds_[r];
1442// rightFieldBounds[r] = rightFieldBounds_[r];
1443// }
1444//
1445// const int cellDataOrdinal = teamMember.league_rank();
1446// const int numThreads = teamMember.team_size(); // num threads
1447// const int G_k_StackHeight = numTensorComponents - 1; // per thread
1448//      Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1449// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1450// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1451// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1452// if (fad_size_output_ > 0) {
1453// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1454// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1455// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1456// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1457// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1458// }
1459// else {
1460// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1461// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1462// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1463// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1464// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1465// }
1466//
1469//
1470// constexpr int R = numTensorComponents - 1;
1471//
1472// // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1473//
1474// // synchronize threads
1475// teamMember.team_barrier();
1476//
1477// for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1478// {
1479// const int a = a_offset_ + a_component;
1480// for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1481// {
1482// const int b = b_offset_ + b_component;
1483//
1484// const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1485// const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1486//
1487// const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1488// const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1489//
1490// const int numPointsFirst = leftFirstComponent.extent_int(1);
1491//
1492// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1493// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1494// });
1495//
1496// // synchronize threads
1497// teamMember.team_barrier();
1498//
1499// for (int i0=0; i0<numLeftFieldsFirst; i0++)
1500// {
1501// for (int j0=0; j0<numRightFieldsFirst; j0++)
1502// {
1503// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1504// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1505// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1506// leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1507// });
1508//
1509// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1510// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1511// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1512// rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1513// });
1514//
1515// // synchronize threads
1516// teamMember.team_barrier();
1517//
1518// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1519// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1520// int remainingIndex = pointEnumerationIndexLaterDimensions;
1521//
1522// for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1523// {
1524// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1525// remainingIndex /= pointBounds[d+1];
1526// }
1527// pointArgumentsInLaterDimensions[0] = remainingIndex;
1528//
1529// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1530// for (int d=R; d>0; d--)
1531// {
1532// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1533// tensorPointEnumerationOffset *= pointBounds[d-1];
1534// }
1535//
1536// Scalar integralValue = 0;
1537// if (fad_size_output_ == 0)
1538// {
1539//            // not a Fad type; we're allowed to use a vector range
1540// Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1541// {
1542// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1543// }, integralValue);
1544// }
1545// else
1546// {
1547// for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1548// {
1549// integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1550// }
1551// }
1552//
1553// G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1554// });
1555//
1556// // synchronize threads
1557// teamMember.team_barrier();
1558//
1559// // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1560// }
1561// }
1562// }
1563// }
1565 }
1566
1567 KOKKOS_INLINE_FUNCTION
1568 void operator()( const TeamMember & teamMember ) const
1569 {
1570 switch (numTensorComponents_)
1571 {
1572 case 1: run<1>(teamMember); break;
1573 case 2: run<2>(teamMember); break;
1574 case 3: runSpecialized3(teamMember); break;
1575 case 4: run<4>(teamMember); break;
1576 case 5: run<5>(teamMember); break;
1577 case 6: run<6>(teamMember); break;
1578 case 7: run<7>(teamMember); break;
1579#ifdef INTREPID2_HAVE_DEBUG
1580 default:
1581 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1582#endif
1583 }
1584 }
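    // Note that only the three-component path (runSpecialized3, used for 3D tensor-product bases)
    // has a complete implementation here; the generic run<N>() above currently throws
    // "implementation incomplete" for other tensor component counts.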
1585
1587    int approximateFlopCountPerCell() const
1588    {
1589 // compute flop count on a single cell
1590 int flopCount = 0;
1591
1592 constexpr int numTensorComponents = 3;
1593 Kokkos::Array<int,numTensorComponents> pointBounds;
1594 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1595 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1596
1597 int pointsInNonzeroComponentDimensions = 1;
1598 for (unsigned r=0; r<numTensorComponents; r++)
1599 {
1600 pointBounds[r] = pointBounds_[r];
1601 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1602 leftFieldBounds[r] = leftFieldBounds_[r];
1603 rightFieldBounds[r] = rightFieldBounds_[r];
1604 }
1605
1606 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1607 {
1608 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1609 {
1610// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1611// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1612// });
1613 flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1614
1615 for (int i0=0; i0<leftFieldBounds[0]; i0++)
1616 {
1617 for (int j0=0; j0<rightFieldBounds[0]; j0++)
1618 {
1619 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1620// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1621// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1622// int remainingIndex = pointEnumerationIndexLaterDimensions;
1623//
1624// for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1625// {
1626// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1627// remainingIndex /= pointBounds[d+1];
1628// }
1629// pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1630//
1631// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1632// for (int d=R; d>0; d--)
1633// {
1634// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1635// tensorPointEnumerationOffset *= pointBounds[d-1];
1636// }
1637//
1638// Scalar integralValue = 0;
1639// if (fad_size_output_ == 0)
1640// {
1641//            // not a Fad type; we're allowed to use a vector range
1642// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1643// {
1644// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1645// }, integralValue);
1646// }
1647// else
1648// {
1649// for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1650// {
1651// integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1652// }
1653// }
1654//
1655// GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1656// });
1657
1658
1659 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the below
1660 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the below
1661 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flops for the integralView += line below
1662
1663// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1664// const int i1 = i1j1 % leftFieldBounds[1];
1665// const int j1 = i1j1 / leftFieldBounds[1];
1666//
1667//              int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1668//
1669// int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1670// for (int lz=0; lz<pointBounds[2]; lz++)
1671// {
1672// Scalar & Gy = GyIntegrals(Gy_index);
1673// Gy = 0.0;
1674//
1675// const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1676// const bool rightRankIs3 = (rightFields_y.rank() == 3);
1677// for (int ly=0; ly<pointBounds[1]; ly++)
1678// {
1679// const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1680// const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1681//
1682// Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1683//
1684// pointEnumerationIndex++;
1685// }
1686// Gy_index++;
1687// }
1688//
1689// for (int i2=0; i2<leftFieldBounds[2]; i2++)
1690// {
1691// for (int j2=0; j2<rightFieldBounds[2]; j2++)
1692// {
1693// Scalar Gz = 0.0;
1694//
1695// int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1696//
1697// const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1698// const bool rightRankIs3 = (rightFields_z.rank() == 3);
1699// for (int lz=0; lz<pointBounds[2]; lz++)
1700// {
1701// const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1702// const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1703//
1704// Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1705//
1706// Gy_index++;
1707// }
1708//
1709// Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1710// Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1711//
1712// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1713// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1714//
1715// if (integralViewRank == 2)
1716// {
1717// integralView_.access(i,j,0) += Gz;
1718// }
1719// else
1720// {
1721// integralView_.access(cellDataOrdinal,i,j) += Gz;
1722// }
1723// }
1724// }
1725// });
1726 }
1727 }
1728 }
1729 }
1730 return flopCount;
1731 }
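    // For a sense of scale: with isotropic bounds (f fields and p points in each of the three
    // dimensions), each (a_component, b_component) pair above contributes approximately
    //   3 f^2 p^3  (Gx integrals)  +  3 f^4 p^2  (Gy)  +  (3 p + 1) f^6  (Gz and accumulation)
    // flops, so the f^6 p term dominates at high polynomial order.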
1732
1734 int teamSize(const int &maxTeamSizeFromKokkos) const
1735 {
1736 // TODO: fix this to match the actual parallelism expressed
1737 const int R = numTensorComponents_ - 1;
1738 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1739 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1740 }
1741
1743 size_t team_shmem_size (int numThreads) const
1744 {
1745 // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1746 size_t shmem_size = 0;
1747
1748 const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1749
1750 int pointsInNonzeroComponentDimensions = 1;
1751 for (int d=1; d<numTensorComponents_; d++)
1752 {
1753 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1754 }
1755
1756 if (fad_size_output_ > 0)
1757 {
1758 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1759 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1760 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_); // GzIntegral: entry with x,y,z integrated away
1761 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1762
1763 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1764 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1765 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1766 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1767 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1768 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1769 }
1770 else
1771 {
1772 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1773 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1774 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads); // GzIntegral: entry with x,y,z integrated away
1775 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1776
1777 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1778 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1779 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1780 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1781 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1782 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1783 }
1784
1785 return shmem_size;
1786 }
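    // For a sense of scale: in the non-Fad case with pointBounds_ = (p,p,p), f fields per
    // dimension, and numThreads = t, the request above is roughly
    //   sizeof(Scalar) * ( p*p + p*t + t + p*p*p + 6*f*p )
    // -- GxIntegrals, GyIntegrals, GzIntegral, pointWeights, and the six field caches --
    // plus any per-view padding that shmem_size() includes.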
1787 };
1788 }
1789
1790template<typename DeviceType>
1791template<class Scalar>
1792Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1793                                                                           const TensorData<Scalar,DeviceType> cellMeasures,
1794 const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1795{
1796 // allocates a (C,F,F) container for storing integral data
1797
1798 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1799 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1800 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1801 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1802 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1803
1804 // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1805 // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1806 // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1807 // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1808 const int CELL_DIM = 0;
1809 const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1810 const auto leftTransform = vectorDataLeft.transform();
1811
1812 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1813 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1814 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1815
1816 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1817 int cellDataExtent = combinedCellDimInfo.dataExtent;
1818
1819 const int numCells = vectorDataLeft.numCells();
1820 const int numFieldsLeft = vectorDataLeft.numFields();
1821 const int numFieldsRight = vectorDataRight.numFields();
1822
1823 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1824 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1825
1826 if (cellVariationType != CONSTANT)
1827 {
1828 Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1829 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1830 }
1831 else
1832 {
1833 Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1834 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1835 }
1836}
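// A typical call sequence (illustrative; hypothetical names -- the transformed basis values and
// cell measures would come from the usual transformation and cell-measure setup):
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedValues, cellMeasures, transformedValues);
//   IntegrationTools<DeviceType>::integrate(integrals, transformedValues, cellMeasures, transformedValues, false, NULL);
// This assembles the symmetric (C,F,F) Gram matrix of a basis against itself.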
1837
1841template<typename DeviceType>
1842template<class Scalar>
1843void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1844                                             const TensorData<Scalar,DeviceType> & cellMeasures,
1845 const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1846 double* approximateFlops)
1847{
1848 using ExecutionSpace = typename DeviceType::execution_space;
1849
1850 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1851 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1852 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1853 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1854 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1855
1856 if (approximateFlops != NULL)
1857 {
1858 *approximateFlops = 0;
1859 }
1860
1861 // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1862 const int integralViewRank = integrals.getUnderlyingViewRank();
1863
1864 if (!sumInto)
1865 {
1866 integrals.clear();
1867 }
1868
1869 const int cellDataExtent = integrals.getDataExtent(0);
1870 const int numFieldsLeft = basisValuesLeft.numFields();
1871 const int numFieldsRight = basisValuesRight.numFields();
1872 const int spaceDim = basisValuesLeft.spaceDim();
1873
1874 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
1875
1876 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1877 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1878
1879  // we require that the number of tensor components in the vectors be the same for each vector entry
1880  // this is not strictly necessary, but it simplifies the implementation, and we don't at present anticipate other use cases
1881 int numTensorComponentsLeft = -1;
1882 const bool isVectorValued = basisValuesLeft.vectorData().isValid();
1883 if (isVectorValued)
1884 {
1885 const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
1886 INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument, "left and right must either both be vector-valued, or both scalar-valued");
1887 const auto &refVectorLeft = basisValuesLeft.vectorData();
1888 int numFamiliesLeft = refVectorLeft.numFamilies();
1889 int numVectorComponentsLeft = refVectorLeft.numComponents();
1890 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1891 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1892 for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1893 {
1894 for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1895 {
1896 const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1897 if (tensorData.numTensorComponents() > 0)
1898 {
1899 if (numTensorComponentsLeft == -1)
1900 {
1901 numTensorComponentsLeft = tensorData.numTensorComponents();
1902 }
1903 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1904 for (int r=0; r<numTensorComponentsLeft; r++)
1905 {
1906 maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1907 }
1908 }
1909 }
1910 }
1911 int numTensorComponentsRight = -1;
1912 const auto &refVectorRight = basisValuesRight.vectorData();
1913 int numFamiliesRight = refVectorRight.numFamilies();
1914 int numVectorComponentsRight = refVectorRight.numComponents();
1915 for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1916 {
1917 for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1918 {
1919 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1920 if (tensorData.numTensorComponents() > 0)
1921 {
1922 if (numTensorComponentsRight == -1)
1923 {
1924 numTensorComponentsRight = tensorData.numTensorComponents();
1925 }
1926 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1927 for (int r=0; r<numTensorComponentsRight; r++)
1928 {
1929 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1930 }
1931 }
1932 }
1933 }
1934 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
1935 }
1936 else
1937 {
1938 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
1939 for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1940 {
1941 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
1942 }
1943
1944 // check that right tensor component count also agrees
1945 for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
1946 {
1947 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
1948 }
1949 }
1950 const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
1951
1952 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
1953 {
1954 // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
1955
1956 // in this case, the integrals in each tensorial direction are entirely separable
1957 // allocate some temporary storage for the integrals in each tensorial direction
1958
1959 // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
1960
1961 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1962 for (int r=0; r<numPointTensorComponents; r++)
1963 {
1964 // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
1965 pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
1966 }
1967
1968 // only one of these will be a valid container:
1969 Kokkos::View<Scalar**, DeviceType> integralView2;
1970 Kokkos::View<Scalar***, DeviceType> integralView3;
1971 if (integralViewRank == 3)
1972 {
1973 integralView3 = integrals.getUnderlyingView3();
1974 }
1975 else
1976 {
1977 integralView2 = integrals.getUnderlyingView2();
1978 }
1979 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
1980 {
1981 int a_offset = 0; // left vector component offset
1982 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
1983
1984 const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
1985 for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
1986 {
1987 TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
1988 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
1989 if (!leftComponent.isValid())
1990 {
1991 a_offset++; // empty components are understood to take up one dimension
1992 continue;
1993 }
1994 const int leftDimSpan = leftComponent.extent_int(2);
1995
1996 const int leftComponentFieldCount = leftComponent.extent_int(0);
1997
1998 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
1999 {
2000 int b_offset = 0; // right vector component offset
2000        int rightFieldOffset = basisValuesRight.basisValues().familyFieldOrdinalOffset(rightFamilyOrdinal); // use the same basisValues() accessor as for the left field offset above
2002
2003 const int rightVectorComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2004 for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2005 {
2006 TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2007 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2008 if (!rightComponent.isValid())
2009 {
2010 b_offset++; // empty components are understood to take up one dimension
2011 continue;
2012 }
2013 const int rightDimSpan = rightComponent.extent_int(2);
2014
2015 const int rightComponentFieldCount = rightComponent.extent_int(0);
2016
2017 // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2018 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2019 {
2020 b_offset += rightDimSpan;
2021
2022 continue;
2023 }
2024
2025 // if the a, b spans intersect, by assumption they should align exactly
2026 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2027 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2028
2029 const int d_start = a_offset;
2030 const int d_end = d_start + leftDimSpan;
2031
2032 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2033 ComponentIntegralsArray componentIntegrals;
2034 for (int r=0; r<numPointTensorComponents; r++)
2035 {
2036 /*
2037 Four vector data cases to consider:
2038 1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2039 2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2040 3. First container is axial, second arbitrary.
2041 4. First is arbitrary, second axial.
2042
2043 But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2044 the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2045
2046 The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2047 while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2048 */
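                // In symbols, the accumulation below computes, for each diagonal entry d,
                //   integral(c,i,j) += w_c * T_left(c,d,d) * T_right(c,d,d) * prod_r componentIntegrals[r][d](i_r, j_r)
                // where w_c is the cell measure weight and (i_r, j_r) are the tensor-component
                // arguments of the field ordinals (i, j).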
2049
2050 Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2051 const int numPoints = pointDimensions[r];
2052
2053 // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2054
2055 Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2056 Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2057
2058 const int leftTensorComponentRank = leftTensorComponent.rank();
2059 const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2060 const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2061 const int rightTensorComponentRank = rightTensorComponent.rank();
2062 const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2063 const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2064
2065 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2066
2067 for (int d=d_start; d<d_end; d++)
2068 {
2069 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2070 if (allocateFadStorage)
2071 {
2072 auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2073 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2074 }
2075 else
2076 {
2077 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2078 }
2079
2080 auto componentIntegralView = componentIntegrals[r][d];
2081
2082 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2083
2084 for (int a=0; a<leftTensorComponentDimSpan; a++)
2085 {
2086 Kokkos::parallel_for("compute componentIntegrals", policy,
2087 KOKKOS_LAMBDA (const int &i, const int &j) {
2088 Scalar refSpaceIntegral = 0.0;
2089 for (int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
2090 {
2091 const Scalar & leftValue = ( leftTensorComponentRank == 2) ? leftTensorComponent(i,ptOrdinal) : leftTensorComponent(i,ptOrdinal,a);
2092 const Scalar & rightValue = (rightTensorComponentRank == 2) ? rightTensorComponent(j,ptOrdinal) : rightTensorComponent(j,ptOrdinal,a);
2093 refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2094 }
2095 componentIntegralView(i,j) = refSpaceIntegral;
2096 });
2097 }
2098
2099 if (approximateFlops != NULL)
2100 {
2101 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2102 }
2103 } // d
2104 } // r
2105
2106 ExecutionSpace().fence();
2107
2108          Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in an effort to get around Intel 17.0.1 compiler weirdness.
2109 Kokkos::Array<int,3> lowerBounds {0,0,0};
2110 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2111 // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2112 Kokkos::parallel_for("compute field integrals", policy,
2113 KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2114 const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2115
2116 TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2117 leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2118
2119 TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2120 rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2121
2122 Scalar integralSum = 0.0;
2123 for (int d=d_start; d<d_end; d++)
2124 {
2125 const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2126 const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2127
2128 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2129 // approximateFlopCount++;
2130
2131 Scalar integral_d = 1.0;
2132
2133 for (int r=0; r<numPointTensorComponents; r++)
2134 {
2135 integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2136 // approximateFlopCount++; // product
2137 }
2138 integralSum += leftRightTransform_d * integral_d;
2139 // approximateFlopCount += 2; // multiply and sum
2140
2141 const int i = leftFieldOrdinal + leftFieldOffset;
2142 const int j = rightFieldOrdinal + rightFieldOffset;
2143
2144 if (integralViewRank == 3)
2145 {
2146 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2147 }
2148 else
2149 {
2150 integralView2(i,j) += cellMeasureWeight * integralSum;
2151 }
2152 }
2153 });
2154 b_offset += rightDimSpan;
2155 } // rightVectorComponentOrdinal
2156 } // rightFamilyOrdinal
2157 a_offset += leftDimSpan;
2158 } // leftVectorComponentOrdinal
2159 } // leftFamilyOrdinal
2160
2161 if (approximateFlops != NULL)
2162 {
2163 // TODO: check the accuracy of this
2164 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2165 }
2166 }
2167 else // general case (not axis-aligned + affine tensor-product structure)
2168 {
2169 // prepare composed transformation matrices
2170 const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2171 const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2172 const bool transposeLeft = true;
2173 const bool transposeRight = false;
2174// auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2175// timer->start();
2176 // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2
2177 const bool matrixTransform = (leftTransform.rank() == 4) || (rightTransform.rank() == 4);
2178 Data<Scalar,DeviceType> composedTransform;
2179 // invalid/empty transforms are used when the identity is intended.
2180 if (leftTransform.isValid() && rightTransform.isValid())
2181 {
2182 if (matrixTransform)
2183 {
2184 composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2185 composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2186
2187 // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2188 if (approximateFlops != NULL)
2189 {
2190 *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2191 }
2192 }
2193 else
2194 {
2195 composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2196 composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2197
2198        // re-cast composedTransform as a rank 4 (C,P,D,D) object -- a 1 x 1 matrix at each (C,P).
2199 const int newRank = 4;
2200 auto extents = composedTransform.getExtents();
2201 auto variationTypes = composedTransform.getVariationTypes();
2202 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2203 if (approximateFlops != NULL)
2204 {
2205 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2206 }
2207 }
2208 }
2209 else if (leftTransform.isValid())
2210 {
2211 // rightTransform is the identity
2212 composedTransform = leftTransform;
2213 }
2214 else if (rightTransform.isValid())
2215 {
2216 // leftTransform is the identity
2217 composedTransform = rightTransform;
2218 }
2219 else
2220 {
2221 // both left and right transforms are identity
2222 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2223 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
2224
2225 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2226 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2227 composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
2228 }
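      // As a concrete example: with spaceDim = 3, the identity view above is {1,1,1}; the CONSTANT
      // and BLOCK_PLUS_DIAGONAL variation types present it as a 3x3 identity matrix at every
      // (cell, point) without materializing numCells * numPoints * 9 entries.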
2229
2230// timer->stop();
2231// std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2232// timer->reset();
2233
2234 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2235 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2236 const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2237 const int rightComponentCount = isVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2238
2239 int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2240 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2241 {
2242 // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2243 // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2244 int a_offset = 0;
2245 bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2246 for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2247 {
2248 TensorData<Scalar,DeviceType> leftComponent = isVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2249 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2250 if (!leftComponent.isValid())
2251 {
2252 // represents zero
2253 a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2254 continue;
2255 }
2256
2257 int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2258 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2259 {
2260 // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2261 // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2262 bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2263 int b_offset = 0;
2264 for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2265 {
2266 TensorData<Scalar,DeviceType> rightComponent = isVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2267 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2268 if (!rightComponent.isValid())
2269 {
2270 // represents zero
2271 b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2272 continue;
2273 }
2274
2275 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2276
2277 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2278 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2279
2280 // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2281 // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2282 // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2283 // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2284 // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2285 // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2286 // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
2287              bool forceNonSpecialized = false; // We might expose this as an integrate() argument in the future; if we do, it should default to false.
2288 bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; has better performance under CUDA, but slightly worse performance than standard on serial CPU
2289
2290 // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2291 // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2292 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2293 {
2294 ExecutionSpace().fence();
2295 }
2296 haveLaunchedContributionToCurrentFamilyLeft = true;
2297 haveLaunchedContributionToCurrentFamilyRight = true;
2298
2299 if (integralViewRank == 2)
2300 {
2301 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2302 {
2303 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2304
2305 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2306 const int teamSize = functor.teamSize(recommendedTeamSize);
2307
2308                  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2309
2310 Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2311
2312 if (approximateFlops != NULL)
2313 {
2314 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2315 }
2316 }
2317 else
2318 {
2319 auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2320
2321 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2322 const int teamSize = functor.teamSize(recommendedTeamSize);
2323
2324 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2325
2326 Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2327
2328 if (approximateFlops != NULL)
2329 {
2330 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2331 }
2332 }
2333 }
2334 else if (integralViewRank == 3)
2335 {
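// the rank-3 case mirrors the rank-2 dispatch above, instantiating the same functors
// with integralViewRank = 3.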
2336 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2337 {
2338 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2339
2340 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2341 const int teamSize = functor.teamSize(recommendedTeamSize);
2342
2343 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2344
2345 Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2346
2347 if (approximateFlops != NULL)
2348 {
2349 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2350 }
2351 }
2352 else
2353 {
2354 auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2355
2356 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2357 const int teamSize = functor.teamSize(recommendedTeamSize);
2358
2359 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2360
2361 Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2362
2363 if (approximateFlops != NULL)
2364 {
2365 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2366 }
2367 }
2368 }
2369 b_offset += isVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2370 }
2371 rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2372 }
2373 a_offset += isVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2374 }
2375 leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2376 }
2377 }
2378// if (approximateFlops != NULL)
2379// {
2380// std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2381// }
2382 ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2383}
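// A usage sketch for the integrate() implementation above, assuming the caller has already
// computed transformed basis values and cell measures. allocateIntegralData() is the companion
// allocation routine declared alongside integrate(); the container names used here
// (transformedValues, cellMeasures) are placeholders, not a verbatim Intrepid2 example:
//
//   double flopEstimate = 0.0;
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedValues, cellMeasures, transformedValues);
//   IntegrationTools<DeviceType>::integrate(integrals, transformedValues, cellMeasures, transformedValues,
//                                           /*sumInto*/ false, &flopEstimate);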
2384
2385} // end namespace Intrepid2
2386
2387#endif