1 #include <goofit/Faddeeva.h>
2 #include <goofit/PDFs/basic/VoigtianPdf.h>
3 #include <goofit/detail/Complex.h>
8 #define M_2PI 6.28318530717958
9 //#define ROOT2 1.41421356
11 // tables for Pade approximation
12 __constant__ fptype C[7]
13 = {65536.0, -2885792.0, 69973904.0, -791494704.0, 8962513560.0, -32794651890.0, 175685635125.0};
14 __constant__ fptype D[7]
15 = {192192.0, 8648640.0, 183783600.0, 2329725600.0, 18332414100.0, 84329104860.0, 175685635125.0};
17 //#define UNROLL_LOOP 1
20 __constant__ fptype n1[12] = {0.25, 1.0, 2.25, 4.0, 6.25, 9.0, 12.25, 16.0, 20.25, 25.0, 30.25, 36.0};
21 __constant__ fptype e1[12] = {0.7788007830714049,
30 1.388794386496402e-11,
31 7.287724095819692e-14,
32 2.319522830243569e-16};
34 // table 2: coefficients for h = 0.53
35 __constant__ fptype n2[12]
36 = {0.2809, 1.1236, 2.5281, 4.4944, 7.0225, 10.1124, 13.7641, 17.9776, 22.7529, 28.09, 33.9889, 40.4496};
37 __constant__ fptype e2[12] = {0.7551038420890235,
45 1.313835773243312e-10,
46 6.319285885175346e-13,
47 1.733038792213266e-15,
48 2.709954036083074e-18};
50 __device__ fpcomplex device_Faddeeva_2(const fpcomplex &z) {
51 fptype *n, *e, t, u, r, s, d, f, g, h;
55 s = thrust::norm(z); // NB: norm2 is correct although CPU version calls the function 'norm'.
58 // use Pade approximation
60 v = exp(zz); // Note lower-case! This is our own already-templated exp function for thrust::complex, no need for
61 // float/double define.
65 for(i = 1; i <= 6; i++) {
70 return fptype(1.0) / v + fpcomplex(0.0, M_2_SQRTPI) * c / d2 * z * v;
74 // select default table 1
79 // if z is too close to a pole select table 2
80 if(fabs(z.imag()) < 0.01 && fabs(z.real()) < 6.01) {
81 // h = modf(2*fabs(z.real),&g);
82 // Equivalent to above. Do this way because nvcc only knows about double version of modf.
83 h = fabs(z.real()) * 2;
87 if(h < 0.02 || h > 0.98) {
94 d = (z.imag() - z.real()) * (z.imag() + z.real());
95 f = 4 * z.real() * z.real() * z.imag() * z.imag();
99 for(i = 0; i < 12; i++) {
101 u = e[i] / (t * t + f);
108 c = r * fpcomplex(z.imag() * (u + 2.0 * g), z.real() * (u + 2.0 * h));
110 if(z.imag() < M_2PI) {
117 g = 2.0 * exp(d - u) / (s * s + f * f);
118 u = 2.0 * z.real() * z.imag();
121 c += g * fpcomplex((h * f - t * s), -(h * s + t * f));
128 __device__ fpcomplex device_Faddeeva_2(const fpcomplex &z) {
129 fptype u, s, d, f, g, h;
132 s = norm2(z); // NB: norm2 is correct although CPU version calls the function 'norm'.
135 // use Pade approximation
136 fpcomplex zz = z * z;
137 v = exp(zz); // Note lower-case! This is our own already-templated exp function for thrust::complex, no need for
138 // float/double define.
142 for(int i = 1; i < 7; ++i) {
147 return fptype(1.0) / v + fpcomplex(0.0, M_2_SQRTPI) * c / d2 * z * v;
150 // use trapezoid rule
151 fptype r = M_1_PI * 0.5;
152 bool useDefault = true;
154 // if z is too close to a pole select table 2
155 if(fabs(z.imag) < 0.01 && fabs(z.real) < 6.01) {
156 // h = modf(2*fabs(z.real),&g);
157 // Equivalent to above. Do this way because nvcc only knows about double version of modf.
158 h = fabs(z.real) * 2;
162 if(h < 0.02 || h > 0.98) {
168 d = (z.imag - z.real) * (z.imag + z.real);
169 f = 4 * z.real * z.real * z.imag * z.imag;
172 fptype currentN = (useDefault ? 0.25 : 0.2809);
173 fptype currentE = (useDefault ? 0.7788007830714049 : 0.7551038420890235);
174 fptype t = d + currentN;
175 u = currentE / (t * t + f);
176 g += (s + currentN) * u;
177 h += (s - currentN) * u;
179 currentN = (useDefault ? 1.0 : 1.1236);
180 currentE = (useDefault ? 0.3678794411714423 : 0.3251072991205958);
182 u = currentE / (t * t + f);
183 g += (s + currentN) * u;
184 h += (s - currentN) * u;
186 currentN = (useDefault ? 2.25 : 2.5281);
187 currentE = (useDefault ? 1.053992245618643e-1 : 7.981051630007964e-2);
189 u = currentE / (t * t + f);
190 g += (s + currentN) * u;
191 h += (s - currentN) * u;
193 currentN = (useDefault ? 4.0 : 4.4944);
194 currentE = (useDefault ? 1.930454136227709e-3 : 0.891593719995219e-3);
196 u = currentE / (t * t + f);
197 g += (s + currentN) * u;
198 h += (s - currentN) * u;
200 currentN = (useDefault ? 6.25 : 7.0225);
201 currentE = (useDefault ? 4.785117392129009e-6 : 1.052755021528803e-6);
203 u = currentE / (t * t + f);
204 g += (s + currentN) * u;
205 h += (s - currentN) * u;
207 currentN = (useDefault ? 9.0 : 10.1124);
208 currentE = (useDefault ? 1.605228055185612e-9 : 1.313835773243312e-10);
210 u = currentE / (t * t + f);
211 g += (s + currentN) * u;
212 h += (s - currentN) * u;
214 currentN = (useDefault ? 12.25 : 13.7641);
215 currentE = (useDefault ? 7.287724095819692e-14 : 1.733038792213266e-15);
217 u = currentE / (t * t + f);
218 g += (s + currentN) * u;
219 h += (s - currentN) * u;
221 currentN = (useDefault ? 16.0 : 17.9776);
222 currentE = (useDefault ? 1.831563888873418e-2 : 1.117138143353082e-2);
224 u = currentE / (t * t + f);
225 g += (s + currentN) * u;
226 h += (s - currentN) * u;
228 currentN = (useDefault ? 20.25 : 22.7529);
229 currentE = (useDefault ? 1.234098040866795e-4 : 4.057331392320188e-5);
231 u = currentE / (t * t + f);
232 g += (s + currentN) * u;
233 h += (s - currentN) * u;
235 currentN = (useDefault ? 25.0 : 28.09);
236 currentE = (useDefault ? 1.125351747192591e-7 : 1.557498087816203e-8);
238 u = currentE / (t * t + f);
239 g += (s + currentN) * u;
240 h += (s - currentN) * u;
242 currentN = (useDefault ? 30.25 : 33.9889);
243 currentE = (useDefault ? 1.388794386496402e-11 : 6.319285885175346e-13);
245 u = currentE / (t * t + f);
246 g += (s + currentN) * u;
247 h += (s - currentN) * u;
249 currentN = (useDefault ? 36.0 : 40.4496);
250 currentE = (useDefault ? 2.319522830243569e-16 : 2.709954036083074e-18);
252 u = currentE / (t * t + f);
253 g += (s + currentN) * u;
254 h += (s - currentN) * u;
257 c = r * fpcomplex(z.imag * (u + 2.0 * g), z.real * (u + 2.0 * h));
266 g = 2.0 * exp(d - u) / (s * s + f * f);
267 u = 2.0 * z.real * z.imag;
270 c += g * fpcomplex((h * f - t * s), -(h * s + t * f));
277 __device__ fptype device_Voigtian(fptype *evt, fptype *p, unsigned int *indices) {
279 fptype m = p[indices[1]];
280 fptype w = p[indices[2]];
281 fptype s = p[indices[3]];
283 // return constant for zero width and sigma
284 if((0 == s) && (0 == w))
287 // assert(s > 0); // No device-side assert?!
292 // Breit-Wigner for zero sigma
294 return (1 / (arg * arg + 0.25 * w * w));
296 fptype coef = -0.5 / (s * s);
298 // Gauss for zero width
300 return exp(coef * arg * arg);
302 // actual Voigtian for non-trivial width and sigma
303 // fptype c = 1./(ROOT2*s);
304 fptype c = 0.707106781187; // 1/root(2)
306 fptype a = 0.5 * c * w;
309 fpcomplex v = device_Faddeeva_2(z);
311 #define rsqrtPi 0.5641895835477563
312 return c * rsqrtPi * v.real();
315 __device__ device_function_ptr ptr_to_Voigtian = device_Voigtian;
317 __host__ VoigtianPdf::VoigtianPdf(std::string n, Observable _x, Variable m, Variable s, Variable w)
319 std::vector<unsigned int> pindices;
320 pindices.push_back(registerParameter(m));
321 pindices.push_back(registerParameter(s));
322 pindices.push_back(registerParameter(w));
323 GET_FUNCTION_ADDR(ptr_to_Voigtian);
324 initialize(pindices);
327 } // namespace GooFit