Highly Efficient FFT for Exascale: HeFFTe v2.4 -- generated reference listing for heffte_backend_cuda.h
1/*
2 -- heFFTe --
3 Univ. of Tennessee, Knoxville
4 @date
5*/
6
7#ifndef HEFFTE_BACKEND_CUDA_H
8#define HEFFTE_BACKEND_CUDA_H
9
10#include "heffte_r2r_executor.h"
11
12#ifdef Heffte_ENABLE_CUDA
13
14#include <cuda_runtime_api.h>
15#include <cuda.h>
16#include <cufft.h>
17#include "heffte_backend_vector.h"
18
19#ifdef Heffte_ENABLE_MAGMA
20#include "heffte_magma_helpers.h"
21#endif
22
37
38namespace heffte{
39
44namespace cuda {
49 inline void check_error(cudaError_t status, const char *function_name){
50 if (status != cudaSuccess)
51 throw std::runtime_error(std::string(function_name) + " failed with message: " + cudaGetErrorString(status));
52 }
53
57 inline void check_error(cufftResult status, const char *function_name){
58 if (status != CUFFT_SUCCESS)
59 throw std::runtime_error(std::string(function_name) + " failed with error code: " + std::to_string(status));
60 }
61
67 template<typename precision_type, typename index>
68 void convert(cudaStream_t stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);
75 template<typename precision_type, typename index>
76 void convert(cudaStream_t stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);
77
82 template<typename scalar_type, typename index>
83 void scale_data(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor);
84
91 template<typename scalar_type, typename index>
92 void direct_pack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
99 template<typename scalar_type, typename index>
100 void direct_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[]);
107 template<typename scalar_type, typename index>
108 void transpose_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
109 index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,
110 scalar_type const source[], scalar_type destination[]);
111
118 template<typename precision>
119 static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[]);
121 template<typename precision>
122 static void post_forward(cudaStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
124 template<typename precision>
125 static void pre_backward(cudaStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
127 template<typename precision>
128 static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[]);
130 static int compute_extended_length(int length){
131 return 4 * length;
132 }
133 };
134
140 template<typename precision>
141 static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[]);
143 template<typename precision>
144 static void post_forward(cudaStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
146 template<typename precision>
147 static void pre_backward(cudaStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
149 template<typename precision>
150 static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[]);
152 static int compute_extended_length(int length){
154 }
155 };
156
159 template<typename precision>
160 static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[]);
162 template<typename precision>
163 static void post_forward(cudaStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
165 template<typename precision>
166 static void pre_backward(cudaStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
168 template<typename precision>
169 static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[]);
171 static int compute_extended_length(int length){
172 return 4 * ( length-1 );
173 }
174 };
175
176}
177
178namespace backend{
183 template<> struct is_enabled<cufft> : std::true_type{};
188 template<> struct is_enabled<cufft_cos> : std::true_type{};
193 template<> struct is_enabled<cufft_sin> : std::true_type{};
194
195 template<> struct is_enabled<cufft_cos1> : std::true_type{};
196
201 template<>
202 struct device_instance<tag::gpu>{
204 device_instance(cudaStream_t new_stream = nullptr) : _stream(new_stream){}
206 cudaStream_t stream(){ return _stream; }
208 cudaStream_t stream() const{ return _stream; }
210 void synchronize_device() const{ cuda::check_error(cudaStreamSynchronize(_stream), "device sync"); }
212 mutable cudaStream_t _stream;
214 using stream_type = cudaStream_t;
215 };
216
221 template<> struct default_backend<tag::gpu>{
223 using type = cufft;
224 };
225
230 template<> struct data_manipulator<tag::gpu> {
232 using stream_type = cudaStream_t;
236 template<typename scalar_type>
237 static scalar_type* allocate(cudaStream_t, size_t num_entries){
238 void *new_data = nullptr;
239 cuda::check_error(cudaMalloc(&new_data, num_entries * sizeof(scalar_type)), "cudaMalloc()");
240 return reinterpret_cast<scalar_type*>(new_data);
241 }
242
243 template<typename scalar_type>
244 static void free(cudaStream_t, scalar_type *pntr){
245 if (pntr == nullptr) return;
246 cuda::check_error(cudaFree(pntr), "cudaFree()");
247 }
248
249 template<typename scalar_type>
250 static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
251 if (stream == nullptr)
252 cuda::check_error(cudaMemcpy(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice), "data_manipulator::copy_n()");
253 else
254 cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice, stream), "data_manipulator::copy_n()");
255 }
256
257 template<typename scalar_type>
258 static void copy_n(cudaStream_t stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){
259 cuda::convert(stream, static_cast<long long>(num_entries), source, destination);
260 }
261
262 template<typename scalar_type>
263 static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){
264 cuda::convert(stream, static_cast<long long>(num_entries), source, destination);
265 }
266
267 template<typename scalar_type>
268 static void copy_device_to_host(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
269 cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToHost, stream),
270 "device_to_host (cuda)");
271 }
272
273 template<typename scalar_type>
274 static void copy_device_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
275 cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyDeviceToDevice, stream),
276 "device_to_device (cuda)");
277 }
278
279 template<typename scalar_type>
280 static void copy_host_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
281 cuda::check_error(cudaMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), cudaMemcpyHostToDevice, stream),
282 "host_to_device (cuda)");
283 }
284 };
285
290 template<>
295 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
296 };
297
301 template<>
306 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
307 };
308
312 template<>
317 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
318 };
319
320 template<>
325 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
326 };
327}
328
333template<> struct is_ccomplex<cufftComplex> : std::true_type{};
338template<> struct is_zcomplex<cufftDoubleComplex> : std::true_type{};
339
346template<typename scalar_type> struct plan_cufft{
356 plan_cufft(cudaStream_t stream, int size, int batch, int stride, int dist){
357 size_t work_size = 0;
358 cuda::check_error(cufftCreate(&plan), "plan_cufft::cufftCreate()");
360 cufftMakePlanMany(plan, 1, &size, &size, stride, dist, &size, stride, dist,
361 (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
362 batch, &work_size),
363 "plan_cufft::cufftMakePlanMany() 1D"
364 );
365
366 if (stream != nullptr)
367 cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
368 }
369
380 plan_cufft(cudaStream_t stream, int size1, int size2, std::array<int, 2> const &embed, int batch, int stride, int dist){
381 size_t work_size = 0;
382 int size[2] = {size2, size1};
383
384 cuda::check_error(cufftCreate(&plan), "plan_cufft::cufftCreate()");
385 if (embed[0] == 0 and embed[1] == 0){
387 cufftMakePlanMany(plan, 2, size, size, stride, dist, size, stride, dist,
388 (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
389 batch, &work_size),
390 "plan_cufft::cufftMakePlanMany() 2D"
391 );
392 }else{
394 cufftMakePlanMany(plan, 2, size, const_cast<int*>(embed.data()), stride, dist, const_cast<int*>(embed.data()), stride, dist,
395 (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z,
396 batch, &work_size),
397 "plan_cufft::cufftMakePlanMany() 2D with embedding"
398 );
399 }
400
401 if (stream != nullptr)
402 cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
403 }
404
405 plan_cufft(cudaStream_t stream, int size1, int size2, int size3){
407 cufftPlan3d(&plan, size3, size2, size1,
408 (std::is_same<scalar_type, std::complex<float>>::value) ? CUFFT_C2C : CUFFT_Z2Z),
409 "plan_cufft::cufftPlan3d()"
410 );
411 if (stream != nullptr)
412 cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
413 }
414
415 ~plan_cufft(){ cufftDestroy(plan); }
417 operator cufftHandle() const{ return plan; }
418
419private:
421 cufftHandle plan;
422};
423
437public:
445 template<typename index>
446 cufft_executor(cudaStream_t active_stream, box3d<index> const box, int dimension) :
447 stream(active_stream),
448 size(box.size[dimension]), size2(0),
449 howmanyffts(fft1d_get_howmany(box, dimension)),
450 stride(fft1d_get_stride(box, dimension)),
451 dist((dimension == box.order[0]) ? size : 1),
452 blocks((dimension == box.order[1]) ? box.osize(2) : 1),
453 block_stride(box.osize(0) * box.osize(1)),
454 total_size(box.count()),
455 embed({0, 0})
456 {}
457
458 template<typename index>
459 cufft_executor(cudaStream_t active_stream, box3d<index> const box, int dir1, int dir2) :
460 stream(active_stream),
461 size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
462 blocks(1), block_stride(0), total_size(box.count()), embed({0, 0})
463 {
464 int odir1 = box.find_order(dir1);
465 int odir2 = box.find_order(dir2);
466
467 if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
468 stride = 1;
469 dist = size * size2;
470 howmanyffts = box.size[2];
471 }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
472 stride = box.size[0];
473 dist = 1;
474 howmanyffts = box.size[0];
475 }else{ // case of directions (0, 2)
476 stride = 1;
477 dist = size;
478 embed = {static_cast<int>(box.size[2]), static_cast<int>(box.size[1] * box.size[0])};
479 howmanyffts = box.size[1];
480 }
481 }
482
483 template<typename index>
484 cufft_executor(cudaStream_t active_stream, box3d<index> const box) :
485 stream(active_stream),
486 size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
487 stride(0), dist(0),
488 blocks(1), block_stride(0),
489 total_size(box.count()),
490 embed({0, 0})
491 {}
492
494 void forward(std::complex<float> data[], std::complex<float>*) const override{
495 make_plan(ccomplex_plan);
496 for(int i=0; i<blocks; i++){
497 cufftComplex* block_data = reinterpret_cast<cufftComplex*>(data + i * block_stride);
498 cuda::check_error(cufftExecC2C(*ccomplex_plan, block_data, block_data, CUFFT_FORWARD), "cufft_executor::cufftExecC2C() forward");
499 }
500 }
501
502 void backward(std::complex<float> data[], std::complex<float>*) const override{
503 make_plan(ccomplex_plan);
504 for(int i=0; i<blocks; i++){
505 cufftComplex* block_data = reinterpret_cast<cufftComplex*>(data + i * block_stride);
506 cuda::check_error(cufftExecC2C(*ccomplex_plan, block_data, block_data, CUFFT_INVERSE), "cufft_executor::cufftExecC2C() backward");
507 }
508 }
509
510 void forward(std::complex<double> data[], std::complex<double>*) const override{
511 make_plan(zcomplex_plan);
512 for(int i=0; i<blocks; i++){
513 cufftDoubleComplex* block_data = reinterpret_cast<cufftDoubleComplex*>(data + i * block_stride);
514 cuda::check_error(cufftExecZ2Z(*zcomplex_plan, block_data, block_data, CUFFT_FORWARD), "cufft_executor::cufftExecZ2Z() forward");
515 }
516 }
517
518 void backward(std::complex<double> data[], std::complex<double>*) const override{
519 make_plan(zcomplex_plan);
520 for(int i=0; i<blocks; i++){
521 cufftDoubleComplex* block_data = reinterpret_cast<cufftDoubleComplex*>(data + i * block_stride);
522 cuda::check_error(cufftExecZ2Z(*zcomplex_plan, block_data, block_data, CUFFT_INVERSE), "cufft_executor::cufftExecZ2Z() backward");
523 }
524 }
525
527 void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
528 cuda::convert(stream, total_size, indata, outdata);
529 forward(outdata, workspace);
530 }
531
532 void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
533 backward(indata, workspace);
534 cuda::convert(stream, total_size, indata, outdata);
535 }
536
537 void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
538 cuda::convert(stream, total_size, indata, outdata);
539 forward(outdata, workspace);
540 }
541
542 void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
543 backward(indata, workspace);
544 cuda::convert(stream, total_size, indata, outdata);
545 }
546
548 int box_size() const override{ return total_size; }
550 size_t workspace_size() const override{ return 0; }
551
552private:
554 template<typename scalar_type>
555 void make_plan(std::unique_ptr<plan_cufft<scalar_type>> &plan) const{
556 if (not plan){
557 if (dist == 0)
558 plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, size2, howmanyffts));
559 else if (size2 == 0)
560 plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, howmanyffts, stride, dist));
561 else
562 plan = std::unique_ptr<plan_cufft<scalar_type>>(new plan_cufft<scalar_type>(stream, size, size2, embed, howmanyffts, stride, dist));
563 }
564 }
565
566 mutable cudaStream_t stream;
567
568 int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
569 std::array<int, 2> embed;
570 mutable std::unique_ptr<plan_cufft<std::complex<float>>> ccomplex_plan;
571 mutable std::unique_ptr<plan_cufft<std::complex<double>>> zcomplex_plan;
572};
573
580template<typename scalar_type> struct plan_cufft_r2c{
592 plan_cufft_r2c(cudaStream_t stream, direction dir, int size, int batch, int stride, int rdist, int cdist){
593 static_assert(std::is_same<scalar_type, float>::value or std::is_same<scalar_type, double>::value,
594 "plan_cufft_r2c can be used only with scalar_type float or double.");
595
596 size_t work_size = 0;
597 cuda::check_error(cufftCreate(&plan), "plan_cufft_r2c::cufftCreate()");
598
600 cufftMakePlanMany(plan, 1, &size, &size, stride,
601 (dir == direction::forward) ? rdist : cdist,
602 &size, stride,
603 (dir == direction::forward) ? cdist : rdist,
604 (std::is_same<scalar_type, float>::value) ?
605 (dir == direction::forward) ? CUFFT_R2C : CUFFT_C2R
606 : (dir == direction::forward) ? CUFFT_D2Z : CUFFT_Z2D,
607 batch, &work_size),
608 "plan_cufft_r2c::cufftMakePlanMany() (forward)"
609 );
610 if (stream != nullptr)
611 cuda::check_error( cufftSetStream(plan, stream), "cufftSetStream()");
612 }
613
614 ~plan_cufft_r2c(){ cufftDestroy(plan); }
616 operator cufftHandle() const{ return plan; }
617
618private:
620 cufftHandle plan;
621};
622
632public:
642 template<typename index>
643 cufft_executor_r2c(cudaStream_t active_stream, box3d<index> const box, int dimension) :
644 stream(active_stream),
645 size(box.size[dimension]),
646 howmanyffts(fft1d_get_howmany(box, dimension)),
647 stride(fft1d_get_stride(box, dimension)),
648 blocks((dimension == box.order[1]) ? box.osize(2) : 1),
649 rdist((dimension == box.order[0]) ? size : 1),
650 cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
651 rblock_stride(box.osize(0) * box.osize(1)),
652 cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
653 rsize(box.count()),
654 csize(box.r2c(dimension).count())
655 {}
656
658 void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
659 make_plan(sforward, direction::forward);
660 if (blocks == 1 or rblock_stride % 2 == 0){
661 for(int i=0; i<blocks; i++){
662 cufftReal *rdata = const_cast<cufftReal*>(indata + i * rblock_stride);
663 cufftComplex* cdata = reinterpret_cast<cufftComplex*>(outdata + i * cblock_stride);
664 cuda::check_error(cufftExecR2C(*sforward, rdata, cdata), "cufft_executor::cufftExecR2C()");
665 }
666 }else{
667 // need to create a temporary copy of the data since cufftExecR2C() requires aligned input
668 float *rdata = align<backend::cufft>::pntr(reinterpret_cast<float*>(workspace));
669 for(int i=0; i<blocks; i++){
670 backend::data_manipulator<tag::gpu>::copy_n(stream, indata + i * rblock_stride, rblock_stride, rdata);
671 cufftComplex* cdata = reinterpret_cast<cufftComplex*>(outdata + i * cblock_stride);
672 cuda::check_error(cufftExecR2C(*sforward, rdata, cdata), "cufft_executor::cufftExecR2C()");
673 }
674 }
675 }
676
677 void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
678 make_plan(sbackward, direction::backward);
679 if (blocks == 1 or rblock_stride % 2 == 0){
680 for(int i=0; i<blocks; i++){
681 cufftComplex* cdata = const_cast<cufftComplex*>(reinterpret_cast<cufftComplex const*>(indata + i * cblock_stride));
682 cuda::check_error(cufftExecC2R(*sbackward, cdata, outdata + i * rblock_stride), "cufft_executor::cufftExecC2R()");
683 }
684 }else{
685 float *odata = align<backend::cufft>::pntr(reinterpret_cast<float*>(workspace));
686 for(int i=0; i<blocks; i++){
687 cufftComplex* cdata = const_cast<cufftComplex*>(reinterpret_cast<cufftComplex const*>(indata + i * cblock_stride));
688 cuda::check_error(cufftExecC2R(*sbackward, cdata, odata), "cufft_executor::cufftExecC2R()");
689 backend::data_manipulator<tag::gpu>::copy_n(stream, odata, rblock_stride, outdata + i * rblock_stride);
690 }
691 }
692 }
693
694 void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
695 make_plan(dforward, direction::forward);
696 if (blocks == 1 or rblock_stride % 2 == 0){
697 for(int i=0; i<blocks; i++){
698 cufftDoubleReal *rdata = const_cast<cufftDoubleReal*>(indata + i * rblock_stride);
699 cufftDoubleComplex* cdata = reinterpret_cast<cufftDoubleComplex*>(outdata + i * cblock_stride);
700 cuda::check_error(cufftExecD2Z(*dforward, rdata, cdata), "cufft_executor::cufftExecD2Z()");
701 }
702 }else{
703 double *rdata = align<backend::cufft>::pntr(reinterpret_cast<double*>(workspace));
704 for(int i=0; i<blocks; i++){
705 backend::data_manipulator<tag::gpu>::copy_n(stream, indata + i * rblock_stride, rblock_stride, rdata);
706 cufftDoubleComplex* cdata = reinterpret_cast<cufftDoubleComplex*>(outdata + i * cblock_stride);
707 cuda::check_error(cufftExecD2Z(*dforward, rdata, cdata), "cufft_executor::cufftExecD2Z()");
708 }
709 }
710 }
711
712 void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
713 make_plan(dbackward, direction::backward);
714 if (blocks == 1 or rblock_stride % 2 == 0){
715 for(int i=0; i<blocks; i++){
716 cufftDoubleComplex* cdata = const_cast<cufftDoubleComplex*>(reinterpret_cast<cufftDoubleComplex const*>(indata + i * cblock_stride));
717 cuda::check_error(cufftExecZ2D(*dbackward, cdata, outdata + i * rblock_stride), "cufft_executor::cufftExecZ2D()");
718 }
719 }else{
720 double *odata = align<backend::cufft>::pntr(reinterpret_cast<double*>(workspace));
721 for(int i=0; i<blocks; i++){
722 cufftDoubleComplex* cdata = const_cast<cufftDoubleComplex*>(reinterpret_cast<cufftDoubleComplex const*>(indata + i * cblock_stride));
723 cuda::check_error(cufftExecZ2D(*dbackward, cdata, odata), "cufft_executor::cufftExecZ2D()");
724 backend::data_manipulator<tag::gpu>::copy_n(stream, odata, rblock_stride, outdata + i * rblock_stride);
725 }
726 }
727 }
728
730 int box_size() const override{ return rsize; }
732 int complex_size() const override{ return csize; }
734 size_t workspace_size() const override{ return (blocks == 1 or rblock_stride % 2 == 0) ? 0 : rblock_stride / 2 + 1; }
735
736private:
738 template<typename scalar_type>
739 void make_plan(std::unique_ptr<plan_cufft_r2c<scalar_type>> &plan, direction dir) const{
740 if (!plan) plan = std::unique_ptr<plan_cufft_r2c<scalar_type>>(new plan_cufft_r2c<scalar_type>(stream, dir, size, howmanyffts, stride, rdist, cdist));
741 }
742
743 cudaStream_t stream;
744
745 int size, howmanyffts, stride, blocks;
746 int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
747 mutable std::unique_ptr<plan_cufft_r2c<float>> sforward;
748 mutable std::unique_ptr<plan_cufft_r2c<double>> dforward;
749 mutable std::unique_ptr<plan_cufft_r2c<float>> sbackward;
750 mutable std::unique_ptr<plan_cufft_r2c<double>> dbackward;
751};
752
765
778
790
791template<> struct one_dim_backend<backend::cufft_cos1>{
// NOTE(review): this rendering appears to be missing the `using executor = ...;`
// alias (original line 792) that sibling one_dim_backend specializations define;
// confirm against the upstream heffte_backend_cuda.h before relying on this listing.
// There is no real-to-complex variant for the cosine transform.
 793 using executor_r2c = void;
 794};
795
800template<> struct direct_packer<tag::gpu>{
802 template<typename scalar_type, typename index>
803 void pack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
804 cuda::direct_pack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, data, buffer);
805 }
806
807 template<typename scalar_type, typename index>
808 void unpack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
809 cuda::direct_unpack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, buffer, data);
810 }
811};
812
817template<> struct transpose_packer<tag::gpu>{
819 template<typename scalar_type, typename index>
820 void pack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
821 direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer
822 }
823
824 template<typename scalar_type, typename index>
825 void unpack(cudaStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
826 cuda::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
827 plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);
828 }
829};
830
831namespace data_scaling {
836 template<typename scalar_type, typename index>
837 void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor){
838 cuda::scale_data(stream, static_cast<long long>(num_entries), data, scale_factor);
839 }
840
844 template<typename precision_type, typename index>
845 void apply(cudaStream_t stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
846 apply<precision_type>(stream, 2*num_entries, reinterpret_cast<precision_type*>(data), scale_factor);
847 }
848}
849
854template<> struct default_plan_options<backend::cufft>{
856 static const bool use_reorder = false;
857};
858
862template<> struct default_plan_options<backend::cufft_cos>{
864 static const bool use_reorder = true;
865};
866template<> struct default_plan_options<backend::cufft_cos1>{
868 static const bool use_reorder = true;
869};
870
874template<> struct default_plan_options<backend::cufft_sin>{
876 static const bool use_reorder = true;
877};
878
879}
880
881#endif
882
883#endif /* HEFFTE_BACKEND_CUDA_H */
Wrapper to cuFFT API for real-to-complex transform with shortening of the data.
Definition heffte_backend_cuda.h:631
cufft_executor_r2c(cudaStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition heffte_backend_cuda.h:643
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition heffte_backend_cuda.h:677
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition heffte_backend_cuda.h:694
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition heffte_backend_cuda.h:658
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_cuda.h:734
int box_size() const override
Returns the size of the box with real data.
Definition heffte_backend_cuda.h:730
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition heffte_backend_cuda.h:712
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition heffte_backend_cuda.h:732
Wrapper around the cuFFT API.
Definition heffte_backend_cuda.h:436
void forward(std::complex< float > data[], std::complex< float > *) const override
Forward fft, float-complex case.
Definition heffte_backend_cuda.h:494
void backward(std::complex< float > data[], std::complex< float > *) const override
Backward fft, float-complex case.
Definition heffte_backend_cuda.h:502
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition heffte_backend_cuda.h:542
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs float-complex forward transform.
Definition heffte_backend_cuda.h:527
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition heffte_backend_cuda.h:532
cufft_executor(cudaStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition heffte_backend_cuda.h:484
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_cuda.h:550
cufft_executor(cudaStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition heffte_backend_cuda.h:446
cufft_executor(cudaStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition heffte_backend_cuda.h:459
void backward(std::complex< double > data[], std::complex< double > *) const override
Backward fft, double-complex case.
Definition heffte_backend_cuda.h:518
int box_size() const override
Returns the size of the box.
Definition heffte_backend_cuda.h:548
void forward(std::complex< double > data[], std::complex< double > *) const override
Forward fft, double-complex case.
Definition heffte_backend_cuda.h:510
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs double-complex forward transform.
Definition heffte_backend_cuda.h:537
Base class for all backend executors.
Definition heffte_common.h:561
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition heffte_common.h:594
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition heffte_common.h:570
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition heffte_common.h:566
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:169
direction
Indicates the direction of the FFT (internal use only).
Definition heffte_common.h:652
@ backward
Inverse DFT transform.
Definition heffte_common.h:656
@ forward
Forward DFT transform.
Definition heffte_common.h:654
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition heffte_backend_cuda.h:837
void transpose_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stride, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void check_error(cudaError_t status, const char *function_name)
Checks the status of a CUDA command and in case of a failure, converts it to a C++ exception.
Definition heffte_backend_cuda.h:49
void scale_data(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stride, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(cudaStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stride, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
void convert(cudaStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
Contains type tags and templates metadata for the various backends.
Definition heffte_backend_cuda.h:178
Cuda specific methods.
Definition heffte_backend_cuda.h:44
Specialization for the CPU case.
Definition heffte_backend_cuda.h:831
Contains internal type-tags.
Definition heffte_common.h:30
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
static float * pntr(float *p)
Align for float.
Definition heffte_common.h:604
tag::gpu location
The cufft library uses data on the gpu device.
Definition heffte_backend_cuda.h:293
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition heffte_backend_cuda.h:295
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition heffte_backend_cuda.h:325
tag::gpu location
The cufft library uses data on the gpu device.
Definition heffte_backend_cuda.h:323
tag::gpu location
The cufft library uses data on the gpu device.
Definition heffte_backend_cuda.h:304
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition heffte_backend_cuda.h:306
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the cuda vector container.
Definition heffte_backend_cuda.h:317
tag::gpu location
The cufft library uses data on the gpu device.
Definition heffte_backend_cuda.h:315
Defines the container for the temporary buffers.
Definition heffte_common.h:237
Type-tag for the Cosine Transform type 1 using the cuFFT backend.
Definition heffte_common.h:178
Type-tag for the Cosine Transform using the cuFFT backend.
Definition heffte_common.h:168
Type-tag for the Sine Transform using the cuFFT backend.
Definition heffte_common.h:173
Type-tag for the cuFFT backend.
Definition heffte_common.h:162
cudaStream_t stream_type
The stream type for the device.
Definition heffte_backend_cuda.h:232
static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition heffte_backend_cuda.h:250
static void copy_n(cudaStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition heffte_backend_cuda.h:263
static void free(cudaStream_t, scalar_type *pntr)
Free memory.
Definition heffte_backend_cuda.h:244
backend::device_instance< tag::gpu > backend_device
Defines the backend_device.
Definition heffte_backend_cuda.h:234
static void copy_device_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the device.
Definition heffte_backend_cuda.h:274
static void copy_host_to_device(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the host to the device.
Definition heffte_backend_cuda.h:280
static void copy_device_to_host(cudaStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the data from the device to the host.
Definition heffte_backend_cuda.h:268
static void copy_n(cudaStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition heffte_backend_cuda.h:258
static scalar_type * allocate(cudaStream_t, size_t num_entries)
Allocate memory.
Definition heffte_backend_cuda.h:237
Common data-transfer operations, must be specialized for each location (cpu/gpu).
Definition heffte_common.h:59
cufft type
Set the cufft tag.
Definition heffte_backend_cuda.h:223
Defines inverse mapping from the location tag to a default backend tag.
Definition heffte_common.h:430
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
void synchronize_device() const
Syncs the execution with the queue, no-op in the CPU case.
Definition heffte_common.h:418
void * stream_type
The type for the internal stream, the cpu uses just a void pointer.
Definition heffte_common.h:420
device_instance(void *=nullptr)
Empty constructor.
Definition heffte_common.h:410
void * stream()
Returns the nullptr.
Definition heffte_common.h:414
Allows to define whether a specific backend interface has been enabled.
Definition heffte_common.h:226
A generic container that describes a 3d box of indexes.
Definition heffte_geometry.h:67
Definition heffte_backend_cuda.h:157
static void pre_backward(cudaStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_cuda.h:171
static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void post_forward(cudaStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition heffte_backend_cuda.h:116
static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_cuda.h:130
static void pre_backward(cudaStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_forward(cudaStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition heffte_backend_cuda.h:138
static void post_forward(cudaStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void post_backward(cudaStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(cudaStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(cudaStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_cuda.h:152
static const bool use_reorder
The reshape operations will not transpose the data.
Definition heffte_backend_cuda.h:856
static const bool use_reorder
The reshape operations will not transpose the data.
Definition heffte_backend_cuda.h:868
static const bool use_reorder
The reshape operations will not transpose the data.
Definition heffte_backend_cuda.h:864
static const bool use_reorder
The reshape operations will not transpose the data.
Definition heffte_backend_cuda.h:876
Defines a set of default plan options for a given backend.
Definition heffte_common.h:761
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_cuda.h:803
void unpack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition heffte_backend_cuda.h:808
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition heffte_pack3d.h:83
Struct to specialize to allow HeFFTe to recognize custom single precision complex types.
Definition heffte_utils.h:252
Struct to specialize to allow HeFFTe to recognize custom double precision complex types.
Definition heffte_utils.h:270
cufft_executor_r2c executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_cuda.h:763
cufft_executor executor
Defines the complex-to-complex executor.
Definition heffte_backend_cuda.h:761
real2real_executor< backend::cufft, cuda::cos_pre_pos_processor > executor
Defines the complex-to-complex executor.
Definition heffte_backend_cuda.h:774
void executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_cuda.h:776
void executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_cuda.h:788
real2real_executor< backend::cufft, cuda::sin_pre_pos_processor > executor
Defines the complex-to-complex executor.
Definition heffte_backend_cuda.h:786
Indicates the structure that will be used by the fft backend.
Definition heffte_common.h:663
Holds the plan for a pack/unpack operation.
Definition heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition heffte_pack3d.h:40
Plan for the r2c single and double precision transform.
Definition heffte_backend_cuda.h:580
plan_cufft_r2c(cudaStream_t stream, direction dir, int size, int batch, int stride, int rdist, int cdist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_cuda.h:592
~plan_cufft_r2c()
Destructor, deletes the plan.
Definition heffte_backend_cuda.h:614
Wrapper around cufftHandle plans, set for float or double complex.
Definition heffte_backend_cuda.h:346
plan_cufft(cudaStream_t stream, int size1, int size2, int size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition heffte_backend_cuda.h:405
plan_cufft(cudaStream_t stream, int size, int batch, int stride, int dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_cuda.h:356
plan_cufft(cudaStream_t stream, int size1, int size2, std::array< int, 2 > const &embed, int batch, int stride, int dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_cuda.h:380
~plan_cufft()
Destructor, deletes the plan.
Definition heffte_backend_cuda.h:415
Template algorithm for the Sine and Cosine transforms.
Definition heffte_r2r_executor.h:192
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition heffte_common.h:45
void pack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_cuda.h:820
void unpack(cudaStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition heffte_backend_cuda.h:825
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition heffte_pack3d.h:116