GooFit  v2.1.3
GlobalCudaDefines.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <thrust/detail/config.h> // __host__, __device__ defines
4 #include <thrust/system_error.h> // Error types
5 
6 #define _USE_MATH_DEFINES
7 #include <cmath>
8 #include <string>
9 
10 namespace GooFit {
11 extern int host_callnumber; // declaration only; the Doxygen cross-reference places the definition at PdfBase.cpp:33
12 } // namespace GooFit
13 
14 #ifdef _MSC_VER // Microsoft compiler only
15 #define _Pragma(x) __pragma(x) // forwards _Pragma(...) (used by THREAD_SYNCH in this header) to __pragma; NOTE(review): _Pragma takes a string literal whereas __pragma presumably expects bare tokens - confirm on MSVC
16 #endif
17 
18 // Allow code to work on non-CUDA systems (beyond what is provided with thrust)
19 #if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA // any Thrust backend other than CUDA
20 #define __align__(n) // alignment attribute expands to nothing
21 inline void cudaDeviceSynchronize() {} // no-op stand-in; it returns void, so callers cannot test a status code here
22 #define __shared__ // memory-space qualifier expands to nothing
23 #define __constant__ // memory-space qualifier expands to nothing
24 #endif
25 
26 // Specialty copies
#ifdef __CUDACC__
// CUDA build: forward straight to the CUDA runtime. The macro arguments are
// passed on as ordinary function arguments, so no extra grouping is needed.
#define MEMCPY(target, source, count, direction) cudaMemcpy(target, source, count, direction)
#define MEMCPY_TO_SYMBOL(target, source, count, offset, direction) \
    cudaMemcpyToSymbol(target, source, count, offset, direction)
// Copies sizeof(void *) bytes from the symbol `fname` into a variable named
// host_fcn_ptr, which must be visible at the expansion site, then logs the
// function name together with the calling function, file and line.
#define GET_FUNCTION_ADDR(fname) \
    { \
        cudaMemcpyFromSymbol((void **)&host_fcn_ptr, fname, sizeof(void *)); \
        GOOFIT_DEBUG("Using function {} in {}, {}:{}", #fname, __func__, __FILE__, __LINE__); \
    }
#define MEMCPY_FROM_SYMBOL(target, source, count, offset, direction) \
    cudaMemcpyFromSymbol(target, source, count, offset, direction)

// This automatically selects the correct CUDA arch and expands the __ldg intrinsic to work on arbitrary types
// CUDACC only
#include <generics/ldg.h>
// The argument is grouped before its address is taken so that a compound
// lvalue expression is not split by operator precedence.
#define RO_CACHE(x) __ldg(&(x))

#else

// Host-only build: the copies become plain memcpy calls and `direction` is
// accepted but never expanded. Every argument is wrapped in parentheses so a
// compound expression such as `ptr + n` is evaluated as a whole and only then
// cast to char *; without the grouping the cast would bind to `ptr` alone and
// `n` would be applied as a byte offset, unlike in the CUDA build.
#define MEMCPY(target, source, count, direction) memcpy((char *)(target), (source), (count))
#define MEMCPY_TO_SYMBOL(target, source, count, offset, direction) \
    memcpy(((char *)(target)) + (offset), (source), (count))
#define MEMCPY_FROM_SYMBOL(target, source, count, offset, direction) \
    memcpy((char *)(target), ((char *)(source)) + (offset), (count))
// Stores the address of `fname` in a variable named host_fcn_ptr, which must be
// visible at the expansion site, then logs the same message as the CUDA build.
#define GET_FUNCTION_ADDR(fname) \
    { \
        host_fcn_ptr = (void *)fname; \
        GOOFIT_DEBUG("Using function {} in {}, {}:{}", #fname, __func__, __FILE__, __LINE__); \
    }
// No read-only cache on the host: the value is used directly.
#define RO_CACHE(x) (x)
#endif
57 
58 #ifdef _OPENMP
59 #include <omp.h>
60 #endif
61 
62 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP || THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB // OMP and TBB backends share the OpenMP runtime calls below
63 #define THREADIDX (omp_get_thread_num()) // index of the calling OpenMP thread
64 #define BLOCKDIM (omp_get_num_threads()) // size of the current OpenMP team
65 #define BLOCKIDX (0) // treated as a single block
66 #define THREAD_SYNCH _Pragma("omp barrier") // expands to an OpenMP barrier pragma, with no trailing ';'
67 
68 #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP // CPP backend
69 #define THREADIDX (0) // fixed: one thread
70 #define BLOCKDIM (1) // fixed: block of size one
71 #define BLOCKIDX (0) // fixed: one block
72 #define THREAD_SYNCH // expands to nothing
73 
74 #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA // CUDA backend
75 #define THREADIDX (threadIdx.x) // x component only
76 #define BLOCKDIM (blockDim.x) // x component only
77 #define BLOCKIDX (blockIdx.x) // x component only
78 #define THREAD_SYNCH __syncthreads(); // note: this expansion already ends with ';', unlike the other backends
79 #endif
80 
81 // CUDA errors (only needed for explicit memory transfers)
82 // For CUDA case, just use existing errors
83 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
84 #include <driver_types.h> // CUDA build: take cudaError_t from the CUDA headers
85 #else
86 enum cudaError_t { cudaSuccess, cudaErrorMemoryAllocation }; // two-value stand-in so the gooMalloc/gooFree declarations in this header compile without CUDA
87 #endif
88 
namespace GooFit {
// Memory helpers. Only the declarations live in this header; both report their
// outcome as a cudaError_t.
cudaError_t gooMalloc(void **target, size_t bytes);
cudaError_t gooFree(void *ptr);

// Floating-point precision switch: define GOOFIT_SINGLES to build with float,
// otherwise double is used. root2 (square root of 2) and invRootPi
// (1 / square root of pi) are literals of the matching precision. Being
// preprocessor macros, they are not scoped to this namespace.
#ifdef GOOFIT_SINGLES

using fptype = float;
#define root2 1.4142135623730951f
#define invRootPi 0.5641895835477563f

#else

using fptype = double;
#define root2 1.4142135623730951
#define invRootPi 0.5641895835477563

#endif
} // namespace GooFit
108 
109 // Often faster than pow, and works with ints on CUDA<8
110 #define POW2(x) ((x) * (x)) // square; the argument appears twice in the expansion, so avoid arguments with side effects
111 #define POW3(x) ((x) * (x) * (x)) // cube; the argument appears three times in the expansion, so avoid arguments with side effects
112 
113 // Add rsqrt for everyone
114 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 350)
// Generic reciprocal square root, 1 / sqrt(val), usable from host and device code.
// The numerator is written as T(1) rather than the double literal 1.0, so a
// float instantiation is not promoted to a double-precision division before
// being narrowed back to T.
template <typename T>
__host__ __device__ T rsqrt(T val) {
    return T(1) / sqrt(val);
}
119 #endif
120 
121 // Fix for bug in pow(double,int) for CUDA 7 and 7.5 (device problem only)
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ < 8)
__host__ __device__ inline GooFit::fptype pow(GooFit::fptype x, int y) {
    // Promote the integer exponent to fptype and defer to the floating-point
    // overload of pow, sidestepping the (floating, int) overload.
    const GooFit::fptype exponent = static_cast<GooFit::fptype>(y);
    return pow(x, exponent);
}
#endif
double fptype
cudaError_t gooFree(void *ptr)
__host__ __device__ T rsqrt(T val)
cudaError_t gooMalloc(void **target, size_t bytes)
int host_callnumber
Definition: PdfBase.cpp:33