Stokhos_Cuda_SimpleTiledCrsProductTensor.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
43#define STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
44
45#include <iostream>
46
47#include "Kokkos_Core.hpp"
48
49#include "Stokhos_Multiply.hpp"
50#include "Stokhos_BlockCrsMatrix.hpp"
51#include "Stokhos_SimpleTiledCrsProductTensor.hpp"
52
53#include "Stokhos_Cuda_WarpShuffle.hpp" // assumed: header providing the shfl_down() wrappers used below
54
55#include "cuda_profiler_api.h"
56
57namespace Stokhos {
58
59//----------------------------------------------------------------------------
60
61#if 0
62
63// This version is slow because it requires more shared memory and has
64// much more frequent reductions
65template< typename TensorScalar ,
66 typename MatrixScalar ,
67 typename VectorScalar >
68class Multiply<
69 BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
70 MatrixScalar, Kokkos::Cuda >,
71 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
72 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
73{
74public:
75
76 typedef Kokkos::Cuda execution_space ;
77 typedef execution_space::size_type size_type ;
78
79 typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
80 typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
81 typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
82
83 class ProductTensorLoop {
84 public:
85
86 const matrix_type m_A;
87 const vector_type m_x;
88 const vector_type m_y;
89 const size_type m_block_size ;
90
91 ProductTensorLoop( const matrix_type & A,
92 const vector_type & x,
93 const vector_type & y,
94 const size_type block_size )
95 : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
96
97 __device__
98 void operator()(void) const
99 {
100 // Number of bases in the stochastic system:
101 const size_type dim = m_A.block.dimension();
102 const size_type max_tile_size = m_A.block.max_jk_tile_size();
103
104 volatile VectorScalar * const sh_x_k =
105 kokkos_impl_cuda_shared_memory<VectorScalar>();
106 volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
107 volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
108 volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
109 volatile VectorScalar * const sh_y = sh_A_j+m_block_size*max_tile_size;
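      // Descriptive note on the shared-memory carving above: the dynamic
      // allocation holds four tiles of m_block_size*max_tile_size scalars
      // (the j- and k-slices of x and A for up to m_block_size matrix
      // columns), followed by the accumulator sh_y for the rows of the
      // current i-tile.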
110
111 const size_type nid = blockDim.x * blockDim.y;
112 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
113
114 // blockIdx.x == row in the deterministic (finite element) system
115 const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
116 const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
117 size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
118 const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
119 if (remBlock > 0) ++numBlock;
120
121 // Loop over i tiles
122 const size_type n_i_tile = m_A.block.num_i_tiles();
123 for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
124 const size_type i_begin = m_A.block.i_begin(i_tile);
125 const size_type i_size = m_A.block.i_size(i_tile);
126
127 // Zero y
128 for (size_type i=tid; i<i_size; i+=nid) {
129 sh_y[i] = 0.0;
130 }
131
132 // Loop over finite element column blocks.
133 size_type iBlockEntry = iBlockEntryBeg;
134 for (size_type block=0; block<numBlock; ++block,
135 iBlockEntry+=m_block_size) {
136 const size_type block_size =
137 (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
138
139 // Loop over j tiles
140 const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
141 for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
142 const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
143 const size_type j_size = m_A.block.j_size(i_tile, j_tile);
144
145 // Wait for X and A to be used in the previous iteration
146 // before reading new values.
147 __syncthreads();
148
149 // Coalesced read j-blocks of X and A into shared memory
150 for (size_type col=0; col<block_size; ++col) {
151 const size_type iBlockColumn =
152 m_A.graph.entries(iBlockEntry + col);
153 const VectorScalar * const x_j =
154 &m_x(j_begin, iBlockColumn);
155 const MatrixScalar * const A_j =
156 &m_A.values(j_begin, iBlockEntry + col);
157 for (size_type j=tid; j<j_size; j+=nid) {
158 sh_x_j[col+j*m_block_size] = x_j[j];
159 sh_A_j[col+j*m_block_size] = A_j[j];
160 }
161 }
162
163 // Loop over k tiles
164 const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
165 for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
166 const size_type k_begin =
167 m_A.block.k_begin(i_tile, j_tile, k_tile);
168 const size_type k_size =
169 m_A.block.k_size(i_tile, j_tile, k_tile);
170
171 // Wait for X and A to be used in the previous iteration
172 // before reading new values.
173 __syncthreads();
174
175 // Coalesced read j-blocks of X and A into shared memory
176 for (size_type col=0; col<block_size; ++col) {
177 const size_type iBlockColumn =
178 m_A.graph.entries(iBlockEntry + col);
179 const VectorScalar * const x_k =
180 &m_x(k_begin, iBlockColumn);
181 const MatrixScalar * const A_k =
182 &m_A.values(k_begin, iBlockEntry + col);
183 for (size_type k=tid; k<k_size; k+=nid) {
184 sh_x_k[col+k*m_block_size] = x_k[k];
185 sh_A_k[col+k*m_block_size] = A_k[k];
186 }
187 }
188
189 __syncthreads(); // wait for X and A to be read
190
191 // Loop over stochastic rows in this tile
192 for (size_type i=threadIdx.y; i<i_size; i+=blockDim.y) {
193 VectorScalar s = 0;
194
195 // Product tensor entries which this warp will iterate:
196 const size_type lBeg =
197 m_A.block.entry_begin(i_tile, j_tile, k_tile, i);
198 const size_type lEnd =
199 m_A.block.entry_end(i_tile, j_tile, k_tile, i);
200
201 // Loop through sparse tensor contributions with
202 // coalesced reads.
203 for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
204 const size_type kj = m_A.block.coord( l );
205 const TensorScalar v = m_A.block.value( l );
206 const size_type j = ( kj & 0x0ffff ) * m_block_size ;
207 const size_type k = ( kj >> 16 ) * m_block_size ;
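              // A descriptive note: each tensor entry packs its (j,k) index
              // pair into one 32-bit word, with j in the low 16 bits and k in
              // the high 16 bits (kj = j | (k << 16)). Pre-multiplying both by
              // m_block_size turns them into direct offsets into the shared
              // tiles, which are laid out as sh_*[col + index*m_block_size].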
208
209 for ( size_type col = 0; col < block_size; ++col ) {
210 s += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
211 sh_A_k[col+k] * sh_x_j[col+j] );
212 }
213 }
214
215 // Reduction of 'y' within 'blockDim.x'
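            // (Each guarded shfl_down adds the partial sum held by the lane
            // 'delta' positions higher, doubling the span covered by each
            // running sum; after the last step, lane 0 of the sub-warp holds
            // the total over all blockDim.x lanes for this stochastic row.)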
216 if (blockDim.x >= 2) s += shfl_down(s, 1, blockDim.x);
217 if (blockDim.x >= 4) s += shfl_down(s, 2, blockDim.x);
218 if (blockDim.x >= 8) s += shfl_down(s, 4, blockDim.x);
219 if (blockDim.x >= 16) s += shfl_down(s, 8, blockDim.x);
220 if (blockDim.x >= 32) s += shfl_down(s, 16, blockDim.x);
221 if ( threadIdx.x == 0 ) sh_y[i] += s;
222
223 } // i-loop
224
225 } // k-tile loop
226
227 } // j-tile loop
228
229 } // block column loop
230
231 // Wait for all threads to complete the i-tile
232 __syncthreads();
233
234 // Store sum for this tile back in global memory
235 for (size_type i=tid; i<i_size; i+=nid) {
236 m_y( i+i_begin , blockIdx.x ) = sh_y[i];
237 }
238
239 } // i-tile loop
240
241 } // operator()
242
243 }; // ProductTensorLoop
244
245 //------------------------------------
246
247 static void apply( const matrix_type & A ,
248 const vector_type & x ,
249 const vector_type & y )
250 {
251 const size_type row_count = A.graph.row_map.extent(0) - 1;
252 const size_type tensor_dimension = A.block.dimension();
253 const size_type i_tile_size = A.block.max_i_tile_size();
254 const size_type jk_tile_size = A.block.max_jk_tile_size();
255
256#ifdef STOKHOS_DEBUG
257 const size_type nWarp = 4; // Use fewer warps in debug mode to prevent
258 // launch failures
259#else
260 const size_type nWarp = 16;
261#endif
262 const size_type threads_per_row = 16;
263 const size_type rows_per_warp =
264 Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
265 const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
266 const dim3 dGrid( row_count , 1 , 1 );
267
268 const size_type shcap =
269 Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
270 size_type bs =
271 (shcap / sizeof(VectorScalar) - i_tile_size) / (4*jk_tile_size);
272 if (bs % 2 == 0) --bs;
273 const size_type block_size_max = 31;
274 const size_type block_size = std::min(bs, block_size_max);
275 // const int block_size = 9;
276 const size_type shmem =
277 sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size);
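    // Sizing note: the kernel uses
    //   sizeof(VectorScalar) * (4*jk_tile_size*block_size + i_tile_size)
    // bytes of shared memory (the four x/A tiles plus the y accumulator),
    // so block_size is chosen as the largest value fitting in half of the
    // per-block shared memory, capped at 31 and forced to be odd,
    // presumably so the stride-block_size shared-memory accesses avoid
    // bank conflicts.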
278
279 /*
280 ProductTensorLoop kernel( A , x , y, block_size )
281 int res;
282 cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
283 */
284
285#if 0
286
287 std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
288 << std::endl
289 << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
290 << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
291 << " block_size(" << block_size << ")" << std::endl
292 << " shmem(" << shmem/1024 << " kB)" << std::endl
293 << " row_count(" << row_count << ")" << std::endl
294 << " tensor_dimension(" << tensor_dimension << ")" << std::endl
295 << " i_tile_size(" << i_tile_size << ")" << std::endl
296 << " jk_tile_size(" << jk_tile_size << ")" << std::endl
297 ;
298#endif
299 //cudaProfilerStart();
300 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
301 ( ProductTensorLoop( A , x , y, block_size ) );
302 //cudaProfilerStop();
303 }
304};
305
306#else
307
308// This is currently the fastest version, but doesn't have fully coalesced
309// reads of the sparse tensor, nor coalesced writes of y
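// Compared to the version above, it uses a two-dimensional grid
// (blockIdx.x = i-tile, blockIdx.y = finite-element row), keeps each
// stochastic row's partial sum in a register instead of shared memory,
// and performs the warp-shuffle reduction only once per row at the end.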
310template< typename TensorScalar ,
311 typename MatrixScalar ,
312 typename VectorScalar >
313class Multiply<
314 BlockCrsMatrix< SimpleTiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
315 MatrixScalar, Kokkos::Cuda >,
316 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
317 Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
318{
319public:
320
321 typedef Kokkos::Cuda execution_space ;
322 typedef execution_space::size_type size_type ;
323
324 typedef SimpleTiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
325 typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
326 typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;
327
328 class ProductTensorLoop {
329 public:
330
331 const matrix_type m_A;
332 const vector_type m_x;
333 const vector_type m_y;
334 const size_type m_block_size ;
335
336 ProductTensorLoop( const matrix_type & A,
337 const vector_type & x,
338 const vector_type & y,
339 const size_type block_size )
340 : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}
341
342 __device__
343 void operator()(void) const
344 {
345 // Number of bases in the stochastic system:
346 const size_type dim = m_A.block.dimension();
347 const size_type max_tile_size = m_A.block.max_jk_tile_size();
348
349 const size_type nid = blockDim.x * blockDim.y;
350 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
351 // const size_type nid2 = nid / 2;
352 // const bool lower = tid < nid2;
353 // const size_type tid2 = lower ? tid : tid - nid2;
354
355 volatile VectorScalar * const sh_x_k =
356 kokkos_impl_cuda_shared_memory<VectorScalar>();
357 volatile VectorScalar * const sh_x_j = sh_x_k+m_block_size*max_tile_size;
358 volatile VectorScalar * const sh_A_k = sh_x_j+m_block_size*max_tile_size;
359 volatile VectorScalar * const sh_A_j = sh_A_k+m_block_size*max_tile_size;
360 //volatile size_type * const sh_c = (volatile size_type*) (sh_A_j + m_block_size*max_tile_size);
361 __shared__ volatile size_type sh_c[32];
362
363 // blockIdx.y == row in the deterministic (finite element) system
364 const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.y ];
365 const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.y + 1 ];
366 size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
367 const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
368 if (remBlock > 0) ++numBlock;
369
370 // blockIdx.x == i_tile
371 const size_type i_tile = blockIdx.x;
372 const size_type i_begin = m_A.block.i_begin(i_tile);
373 const size_type i_size = m_A.block.i_size(i_tile);
374 const size_type i1 = threadIdx.y;
375 //const size_type i2 = threadIdx.y + blockDim.y;
376 if (i1 >= i_size) return;
377
378 VectorScalar s1 = 0;
379 //VectorScalar s2 = 0;
380
381 // Loop over finite element column blocks.
382 size_type iBlockEntry = iBlockEntryBeg;
383 for (size_type block=0; block<numBlock; ++block,
384 iBlockEntry+=m_block_size) {
385 const size_type block_size =
386 (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;
387
388 __syncthreads();
389 // for (size_type col=tid; col<block_size; col+=nid)
390 // sh_c[col] = m_A.graph.entries(iBlockEntry + col);
391 if (tid == 0) {
392 for (size_type col=0; col<block_size; ++col)
393 sh_c[col] = m_A.graph.entries(iBlockEntry + col);
394 }
395
396 // Loop over j tiles
397 const size_type n_j_tile = m_A.block.num_j_tiles(i_tile);
398 for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
399 const size_type j_begin = m_A.block.j_begin(i_tile, j_tile);
400 const size_type j_size = m_A.block.j_size(i_tile, j_tile);
401
402 // Wait for X and A to be used in the previous iteration
403 // before reading new values.
404 __syncthreads();
405
406 // Coalesced read j-blocks of X and A into shared memory
407 for (size_type col=0; col<block_size; ++col) {
408 const VectorScalar * const x_j = &m_x(j_begin, sh_c[col]);
409 const MatrixScalar * const A_j = &m_A.values(j_begin, iBlockEntry + col);
410 for (size_type j=tid; j<j_size; j+=nid) {
411 sh_x_j[col+j*m_block_size] = x_j[j];
412 sh_A_j[col+j*m_block_size] = A_j[j];
413 }
414 // if (lower) {
415 // for (size_type j=tid2; j<j_size; j+=nid2)
416 // sh_x_j[col+j*m_block_size] = x_j[j];
417 // }
418 // else {
419 // for (size_type j=tid2; j<j_size; j+=nid2)
420 // sh_A_j[col+j*m_block_size] = A_j[j];
421 // }
422 }
423
424 // Loop over k tiles
425 const size_type n_k_tile = m_A.block.num_k_tiles(i_tile, j_tile);
426 for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
427 const size_type k_begin =
428 m_A.block.k_begin(i_tile, j_tile, k_tile);
429 const size_type k_size =
430 m_A.block.k_size(i_tile, j_tile, k_tile);
431
432 // Wait for X and A to be used in the previous iteration
433 // before reading new values.
434 __syncthreads();
435
436 // Coalesced read j-blocks of X and A into shared memory
437 for (size_type col=0; col<block_size; ++col) {
438 const VectorScalar * const x_k =
439 &m_x(k_begin, sh_c[col]);
440 const MatrixScalar * const A_k =
441 &m_A.values(k_begin, iBlockEntry + col);
442 for (size_type k=tid; k<k_size; k+=nid) {
443 sh_x_k[col+k*m_block_size] = x_k[k];
444 sh_A_k[col+k*m_block_size] = A_k[k];
445 }
446 // if (lower) {
447 // for (size_type k=tid2; k<k_size; k+=nid2)
448 // sh_x_k[col+k*m_block_size] = x_k[k];
449 // }
450 // else {
451 // for (size_type k=tid2; k<k_size; k+=nid2)
452 // sh_A_k[col+k*m_block_size] = A_k[k];
453 // }
454 }
455
456 __syncthreads(); // wait for X and A to be read
457
458 // Product tensor entries which this warp will iterate:
459 size_type lBeg =
460 m_A.block.entry_begin(i_tile, j_tile, k_tile, i1);
461 size_type lEnd =
462 m_A.block.entry_end(i_tile, j_tile, k_tile, i1);
463
464 // Loop through sparse tensor contributions with
465 // coalesced reads (these aren't fully coalesced unless
466 // blockDim.x >= 16 for double precision; the tensor entries
467 // would need to be reordered to use fewer threads per row).
468 for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
469 const size_type kj = m_A.block.coord( l );
470 const TensorScalar v = m_A.block.value( l );
471 const size_type j = ( kj & 0x0ffff ) * m_block_size ;
472 const size_type k = ( kj >> 16 ) * m_block_size ;
473
474 for ( size_type col = 0; col < block_size; ++col ) {
475 s1 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
476 sh_A_k[col+k] * sh_x_j[col+j] );
477 }
478 }
479
480 // if (i2 < i_size) {
481 // // Product tensor entries which this warp will iterate:
482 // size_type lBeg =
483 // m_A.block.entry_begin(i_tile, j_tile, k_tile, i2);
484 // size_type lEnd =
485 // m_A.block.entry_end(i_tile, j_tile, k_tile, i2);
486
487 // // Loop through sparse tensor contributions with
488 // // coalesced reads.
489 // for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {
490 // const size_type kj = m_A.block.coord( l );
491 // const TensorScalar v = m_A.block.value( l );
492 // const size_type j = ( kj & 0x0ffff ) * m_block_size ;
493 // const size_type k = ( kj >> 16 ) * m_block_size ;
494
495 // for ( size_type col = 0; col < block_size; ++col ) {
496 // s2 += v * ( sh_A_j[col+j] * sh_x_k[col+k] +
497 // sh_A_k[col+k] * sh_x_j[col+j] );
498 // }
499 // }
500 // }
501
502 } // k-tile loop
503
504 } // j-tile loop
505
506 } // block column loop
507
508 // Wait for all threads to complete the i-tile
509 //__syncthreads();
510
511 // Reduction of 'y' within 'blockDim.x'
512 if (blockDim.x >= 2) s1 += shfl_down(s1, 1, blockDim.x);
513 if (blockDim.x >= 4) s1 += shfl_down(s1, 2, blockDim.x);
514 if (blockDim.x >= 8) s1 += shfl_down(s1, 4, blockDim.x);
515 if (blockDim.x >= 16) s1 += shfl_down(s1, 8, blockDim.x);
516 if (blockDim.x >= 32) s1 += shfl_down(s1, 16, blockDim.x);
517 if ( threadIdx.x == 0 ) m_y( i1+i_begin , blockIdx.y ) = s1;
518
519 // if (i2 < i_size) {
520 // if (blockDim.x >= 2) s2 += shfl_down(s2, 1, blockDim.x);
521 // if (blockDim.x >= 4) s2 += shfl_down(s2, 2, blockDim.x);
522 // if (blockDim.x >= 8) s2 += shfl_down(s2, 4, blockDim.x);
523 // if (blockDim.x >= 16) s2 += shfl_down(s2, 8, blockDim.x);
524 // if (blockDim.x >= 32) s2 += shfl_down(s2, 16, blockDim.x);
525 // if ( threadIdx.x == 0 ) m_y( i2+i_begin , blockIdx.y ) = s2;
526 // }
527
528 } // operator()
529
530 }; // ProductTensorLoop
531
532 //------------------------------------
533
534 static void apply( const matrix_type & A ,
535 const vector_type & x ,
536 const vector_type & y )
537 {
538 const size_type row_count = A.graph.row_map.extent(0) - 1;
539 const size_type tensor_dimension = A.block.dimension();
540 const size_type tile_size = A.block.max_jk_tile_size();
541 const size_type n_i_tile = A.block.num_i_tiles();
542
543#ifdef STOKHOS_DEBUG
544 const size_type nWarp = 2; // Use fewer warps in debug mode to prevent
545 // launch failures
546#else
547 const size_type nWarp = 16;
548#endif
549 const size_type threads_per_row = 4;
550 const size_type rows_per_warp =
551 Kokkos::Impl::CudaTraits::WarpSize / threads_per_row;
552 const dim3 dBlock( threads_per_row , rows_per_warp*nWarp , 1 );
553 const dim3 dGrid( n_i_tile , row_count , 1 );
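    // Launch-geometry note: blockIdx.x selects the i-tile and blockIdx.y
    // the finite-element row; within a block, threadIdx.y picks the
    // stochastic row of the tile and the threads_per_row lanes in
    // threadIdx.x share that row's tensor entries.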
554
555 const size_type shcap =
556 Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
557 size_type bs = ((shcap / sizeof(VectorScalar)) / tile_size) / 4;
558 if (bs % 2 == 0) --bs;
559 const size_type block_size_max = 31;
560 const size_type block_size = std::min(bs, block_size_max);
561 // const int block_size = 9;
562 const size_type shmem = sizeof(VectorScalar)*(4*block_size*tile_size); // + sizeof(size_type)*block_size_max;
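    // Unlike the version above, no shared memory is reserved for y: each
    // row's sum stays in a register (s1) and is written directly to
    // global memory after the warp-shuffle reduction.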
563
564 // Possible improvements: sort rows by number of non-zeros, split
565 // threads between the A and x reads, and handle the diagonals properly.
566 // (Splitting threads between A and x reads didn't seem to improve things.)
567
568 /*
569 ProductTensorLoop kernel( A , x , y, block_size )
570 int res;
571 cuFuncGetAttribute(&res, CU_FUNC_ATTRIBUTE_NUM_REGS, &kernel.operator());
572 */
573
574#if 0
575
576 std::cout << "Multiply< BlockCrsMatrix< SimpleTiledCrsProductTensor ... > >::apply"
577 << std::endl
578 << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
579 << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
580 << " block_size(" << block_size << ")" << std::endl
581 << " shmem(" << shmem/1024 << " kB)" << std::endl
582 << " row_count(" << row_count << ")" << std::endl
583 << " tensor_dimension(" << tensor_dimension << ")" << std::endl
584 << " tile_size(" << tile_size << ")" << std::endl
585 << " num_i_tiles(" << n_i_tile << ")" << std::endl
586 ;
587#endif
588 //cudaProfilerStart();
589 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
590 ( ProductTensorLoop( A , x , y, block_size ) );
591 //cudaProfilerStop();
592 }
593};
594
595#endif
596
597//----------------------------------------------------------------------------
598//----------------------------------------------------------------------------
599
600} // namespace Stokhos
601
602#endif /* #ifndef STOKHOS_CUDA_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */
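For reference, a minimal usage sketch of the specialization defined in this file. It assumes the block matrix A and the input/output views x and y have already been assembled elsewhere, and that the primary Stokhos::Multiply template takes the three type parameters used by the specialization above; the wrapper function stochastic_spmv is illustrative, not part of Stokhos.

#include "Kokkos_Core.hpp"
#include "Stokhos_Cuda_SimpleTiledCrsProductTensor.hpp"

typedef Stokhos::SimpleTiledCrsProductTensor<double, Kokkos::Cuda> tensor_type;
typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda> matrix_type;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda> vector_type;

// Compute the stochastic block-sparse matrix-vector product y = A*x using
// the CUDA kernel defined in this header (illustrative wrapper).
void stochastic_spmv(const matrix_type& A, const vector_type& x, const vector_type& y)
{
  Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply(A, x, y);
}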