GooFit  v2.1.3
TddpPdf.cu
Go to the documentation of this file.
1 #include <goofit/Error.h>
2 #include <goofit/PDFs/physics/TddpPdf.h>
3 
4 #include <thrust/transform_reduce.h>
5 
6 #ifdef GOOFIT_MPI
7 #include <mpi.h>
8 #endif
9 
10 namespace GooFit {
11 
12 const int resonanceOffset = 8; // Offset of the first resonance into the parameter index array
13 // Offset is number of parameters, constant index, indices for tau, xmix, and ymix, index
14 // of resolution function, and finally number of resonances (not calculable from nP
15 // because we don't know what the efficiency and time resolution might need). Efficiency
16 // and time-resolution parameters are after the resonance information.
17 const unsigned int SPECIAL_RESOLUTION_FLAG = 999999999;
18 
19 // The function of this array is to hold all the cached waves; specific
20 // waves are recalculated when the corresponding resonance mass or width
21 // changes. Note that in a multithread environment each thread needs its
22 // own cache, hence the '10'. Ten threads should be enough for anyone!
23 
24 // NOTE: only one set of wave holders is supported currently.
25 __device__ WaveHolder_s *cWaves[16];
26 
27 __device__ inline int parIndexFromResIndex(int resIndex) { return resonanceOffset + resIndex * resonanceSize; }
28 
29 __device__ fpcomplex
30 getResonanceAmplitude(fptype m12, fptype m13, fptype m23, unsigned int functionIdx, unsigned int pIndex) {
31  auto func = reinterpret_cast<resonance_function_ptr>(device_function_table[functionIdx]);
32  return (*func)(m12, m13, m23, paramIndices + pIndex);
33 }
34 
35 __device__ ThreeComplex
36 device_Tddp_calcIntegrals(fptype m12, fptype m13, int res_i, int res_j, fptype *p, unsigned int *indices) {
37  // For calculating Dalitz-plot integrals. What's needed is the products
38  // AiAj*, AiBj*, and BiBj*, where
39  // Ai = BW_i(x, y) + BW_i(y, x)
40  // and Bi reverses the sign of the second BW.
41  // This function returns the above values at a single point.
42  // NB: Multiplication by efficiency is done by the calling function.
43  // Note that this function expects
44  // to be called on a normalisation grid, not on
45  // observed points, that's why it doesn't use
46  // cWaves. No need to cache the values at individual
47  // grid points - we only care about totals.
48 
49  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
50  fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
51  fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
52  fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
53 
54  ThreeComplex ret;
55 
56  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
57  return ret;
58 
59  fptype m23
60  = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
61 
62  int parameter_i = parIndexFromResIndex(res_i);
63  int parameter_j = parIndexFromResIndex(res_j);
64 
65  // fptype amp_real = p[indices[parameter_i+0]];
66  // fptype amp_imag = p[indices[parameter_i+1]];
67  unsigned int functn_i = RO_CACHE(indices[parameter_i + 2]);
68  unsigned int params_i = RO_CACHE(indices[parameter_i + 3]);
69  fpcomplex ai = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
70  fpcomplex bi = getResonanceAmplitude(m13, m12, m23, functn_i, params_i);
71 
72  unsigned int functn_j = RO_CACHE(indices[parameter_j + 2]);
73  unsigned int params_j = RO_CACHE(indices[parameter_j + 3]);
74  fpcomplex aj = conj(getResonanceAmplitude(m12, m13, m23, functn_j, params_j));
75  fpcomplex bj = conj(getResonanceAmplitude(m13, m12, m23, functn_j, params_j));
76 
77  ret = ThreeComplex(
78  (ai * aj).real(), (ai * aj).imag(), (ai * bj).real(), (ai * bj).imag(), (bi * bj).real(), (bi * bj).imag());
79  return ret;
80 }
81 
82 __device__ fptype device_Tddp(fptype *evt, fptype *p, unsigned int *indices) {
83  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
84  fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
85  fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
86  fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
87 
88  fptype m12 = RO_CACHE(evt[RO_CACHE(indices[4 + RO_CACHE(indices[0])])]);
89  fptype m13 = RO_CACHE(evt[RO_CACHE(indices[5 + RO_CACHE(indices[0])])]);
90 
91  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
92  return 0;
93 
94  auto evtNum = static_cast<int>(floor(0.5 + RO_CACHE(evt[indices[6 + RO_CACHE(indices[0])]])));
95 
96  fpcomplex sumWavesA(0, 0);
97  fpcomplex sumWavesB(0, 0);
98  fpcomplex sumRateAA(0, 0);
99  fpcomplex sumRateAB(0, 0);
100  fpcomplex sumRateBB(0, 0);
101 
102  unsigned int numResonances = RO_CACHE(indices[6]);
103  // unsigned int cacheToUse = RO_CACHE(indices[7]);
104 
105  for(int i = 0; i < numResonances; ++i) {
106  int paramIndex = parIndexFromResIndex(i);
107  fpcomplex amp{RO_CACHE(p[RO_CACHE(indices[paramIndex + 0])]), RO_CACHE(p[RO_CACHE(indices[paramIndex + 1])])};
108 
109  // fpcomplex matrixelement(thrust::get<0>(cWaves[cacheToUse][evtNum*numResonances + i]),
110  // thrust::get<1>(cWaves[cacheToUse][evtNum*numResonances + i]));
111  // Note, to make this more efficient we should change it to only an array of fptype's, and read double2 at a
112  // time.
113  fpcomplex ai{RO_CACHE(cWaves[i][evtNum].ai_real), RO_CACHE(cWaves[i][evtNum].ai_imag)};
114  fpcomplex bi{RO_CACHE(cWaves[i][evtNum].bi_real), RO_CACHE(cWaves[i][evtNum].bi_imag)};
115 
116  fpcomplex matrixelement = ai * amp;
117  sumWavesA += matrixelement;
118 
119  // matrixelement = fpcomplex(thrust::get<2>(cWaves[cacheToUse][evtNum*numResonances + i]),
120  // thrust::get<3>(cWaves[cacheToUse][evtNum*numResonances + i]));
121  matrixelement = bi * amp;
122  sumWavesB += matrixelement;
123  }
124 
125  fptype _tau = RO_CACHE(p[RO_CACHE(indices[2])]);
126  fptype _xmixing = RO_CACHE(p[RO_CACHE(indices[3])]);
127  fptype _ymixing = RO_CACHE(p[RO_CACHE(indices[4])]);
128 
129  fptype _time = RO_CACHE(evt[RO_CACHE(indices[2 + RO_CACHE(indices[0])])]);
130  fptype _sigma = RO_CACHE(evt[RO_CACHE(indices[3 + RO_CACHE(indices[0])])]);
131 
132  // if ((gpuDebug & 1) && (0 == BLOCKIDX) && (0 == THREADIDX))
133  // if (0 == evtNum) printf("TDDP: (%f, %f) (%f, %f)\n", sumWavesA.real, sumWavesA.imag, sumWavesB.real,
134  // sumWavesB.imag);
135  // printf("TDDP: %f %f %f %f | %f %f %i\n", m12, m13, _time, _sigma, _xmixing, _tau, evtNum);
136 
137  /*
138  fptype ret = 0;
139  ret += (norm2(sumWavesA) + norm2(sumWavesB))*cosh(_ymixing * _time);
140  ret += (norm2(sumWavesA) - norm2(sumWavesB))*cos (_xmixing * _time);
141  sumWavesA *= conj(sumWavesB);
142  ret -= 2*sumWavesA.real * sinh(_ymixing * _time);
143  ret -= 2*sumWavesA.imag * sin (_xmixing * _time); // Notice sign difference wrt to Mikhail's code, because I have
144  AB* and he has A*B.
145  ret *= exp(-_time);
146  */
147 
148  fptype term1 = thrust::norm(sumWavesA) + thrust::norm(sumWavesB);
149  fptype term2 = thrust::norm(sumWavesA) - thrust::norm(sumWavesB);
150  sumWavesA *= conj(sumWavesB);
151  // printf("(%i, %i) TDDP: %f %f %f %f %f %f %f\n", BLOCKIDX, THREADIDX, term1, term2, sumWavesA.real,
152  // sumWavesA.imag, m12, m13, _tau);
153 
154  // Cannot use callFunction on resolution function.
155  int effFunctionIdx = parIndexFromResIndex(numResonances);
156  int resFunctionIdx = RO_CACHE(indices[5]);
157  int resFunctionPar = 2 + effFunctionIdx;
158  fptype ret = 0;
159  int md0_offset = 0;
160 
161  if(resFunctionIdx == SPECIAL_RESOLUTION_FLAG) {
162  // In this case there are multiple resolution functions, they are stored after the efficiency function,
163  // and which one we use depends on the measured mother-particle mass.
164  md0_offset = 1;
165  fptype massd0 = RO_CACHE(evt[indices[7 + RO_CACHE(indices[0])]]);
166  fptype minMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 6]);
167  fptype md0Step = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 7]);
168  int res_to_use = (massd0 <= minMass) ? 0 : static_cast<int>(floor((massd0 - minMass) / md0Step));
169  int maxFcn = RO_CACHE(indices[2 + effFunctionIdx]);
170 
171  if(res_to_use > maxFcn)
172  res_to_use = maxFcn;
173 
174  // Now calculate index of resolution function.
175  // At the end of the array are indices efficiency_function, efficiency_parameters, maxFcn, res_function_1,
176  // res_function_1_nP, par1, par2 ... res_function_2, res_function_2_nP, ...
177  res_to_use = 3 + effFunctionIdx + res_to_use * (2 + RO_CACHE(indices[effFunctionIdx + 4]));
178  // NB this assumes all resolution functions have the same number of parameters. The number
179  // of parameters in the first resolution function is stored in effFunctionIdx+3; add one to
180  // account for the index of the resolution function itself in the device function table, one
181  // to account for the number-of-parameters index, and this is the space taken up by each
182  // resolution function. Multiply by res_to_use to get the number of spaces to skip to get to
183  // the one we want.
184 
185  resFunctionIdx = RO_CACHE(indices[res_to_use]);
186  resFunctionPar = res_to_use + 1;
187  }
188 
189  ret = (*(reinterpret_cast<device_resfunction_ptr>(device_function_table[resFunctionIdx])))(term1,
190  term2,
191  sumWavesA.real(),
192  sumWavesA.imag(),
193  _tau,
194  _time,
195  _xmixing,
196  _ymixing,
197  _sigma,
198  p,
199  indices
200  + resFunctionPar);
201 
202  // For the reversed (mistagged) fraction, we make the
203  // interchange A <-> B. So term1 stays the same,
204  // term2 changes sign, and AB* becomes BA*.
205  // Efficiency remains the same for the mistagged part,
206  // because it depends on the momenta of the pi+ and pi-,
207  // which don't change even though we tagged a D0 as D0bar.
208 
209  fptype mistag = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 5]);
210 
211  if(mistag > 0) { // This should be either true or false for all events, so no branch is caused.
212  // See header file for explanation of 'mistag' variable - it is actually the probability
213  // of having the correct sign, given that we have a correctly reconstructed D meson.
214  mistag = RO_CACHE(evt[RO_CACHE(indices[md0_offset + 7 + RO_CACHE(indices[0])])]);
215  ret *= mistag;
216  ret += (1 - mistag)
217  * (*(reinterpret_cast<device_resfunction_ptr>(device_function_table[resFunctionIdx])))(
218  term1,
219  -term2,
220  sumWavesA.real(),
221  -sumWavesA.imag(),
222  _tau,
223  _time,
224  _xmixing,
225  _ymixing,
226  _sigma,
227  p,
228  &(indices[resFunctionPar]));
229  }
230 
231  fptype eff = callFunction(evt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1]));
232  // internalDebug = 0;
233  ret *= eff;
234 
235  return ret;
236 }
237 
238 __device__ device_function_ptr ptr_to_Tddp = device_Tddp;
239 
240 __host__ TddpPdf::TddpPdf(std::string n,
241  Observable _dtime,
242  Observable _sigmat,
243  Observable m12,
244  Observable m13,
245  EventNumber eventNumber,
246  DecayInfo3t decay,
247  MixingTimeResolution *r,
248  GooPdf *efficiency,
249  Observable *mistag)
250  : GooPdf(n, _dtime, _sigmat, m12, m13, eventNumber)
251  , decayInfo(decay)
252  , _m12(m12)
253  , _m13(m13)
254  , resolution(r)
255  , totalEventSize(5) // Default 5 = m12, m13, time, sigma_t, evtNum
256 {
257  for(auto &cachedWave : cachedWaves)
258  cachedWave = nullptr;
259 
260  fptype decayConstants[6];
261  decayConstants[5] = 0;
262 
263  if(mistag) {
264  registerObservable(*mistag);
265  totalEventSize = 6;
266  decayConstants[5] = 1; // Flags existence of mistag
267  }
268 
269  std::vector<unsigned int> pindices;
270  pindices.push_back(registerConstants(6));
271  decayConstants[0] = decayInfo.motherMass;
272  decayConstants[1] = decayInfo.daug1Mass;
273  decayConstants[2] = decayInfo.daug2Mass;
274  decayConstants[3] = decayInfo.daug3Mass;
275  decayConstants[4] = decayInfo.meson_radius;
276  MEMCPY_TO_SYMBOL(
277  functorConstants, decayConstants, 6 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);
278 
279  pindices.push_back(registerParameter(decayInfo._tau));
280  pindices.push_back(registerParameter(decayInfo._xmixing));
281  pindices.push_back(registerParameter(decayInfo._ymixing));
282  if(resolution->getDeviceFunction() < 0)
283  throw GooFit::GeneralError("The resolution device function index {} must be more than 0",
284  resolution->getDeviceFunction());
285  pindices.push_back(static_cast<unsigned int>(resolution->getDeviceFunction()));
286  pindices.push_back(decayInfo.resonances.size());
287 
288  static int cacheCount = 0;
289  cacheToUse = cacheCount++;
290  pindices.push_back(cacheToUse);
291 
292  for(auto &resonance : decayInfo.resonances) {
293  pindices.push_back(registerParameter(resonance->amp_real));
294  pindices.push_back(registerParameter(resonance->amp_imag));
295  pindices.push_back(resonance->getFunctionIndex());
296  pindices.push_back(resonance->getParameterIndex());
297  resonance->setConstantIndex(cIndex);
298  components.push_back(resonance);
299  }
300 
301  pindices.push_back(efficiency->getFunctionIndex());
302  pindices.push_back(efficiency->getParameterIndex());
303  components.push_back(efficiency);
304 
305  resolution->createParameters(pindices, this);
306  GET_FUNCTION_ADDR(ptr_to_Tddp);
307  initialize(pindices);
308 
309  redoIntegral = new bool[decayInfo.resonances.size()];
310  cachedMasses = new fptype[decayInfo.resonances.size()];
311  cachedWidths = new fptype[decayInfo.resonances.size()];
312  integrals = new ThreeComplex **[decayInfo.resonances.size()];
313  integrators = new SpecialDalitzIntegrator **[decayInfo.resonances.size()];
314  calculators = new SpecialWaveCalculator *[decayInfo.resonances.size()];
315 
316  for(int i = 0; i < decayInfo.resonances.size(); ++i) {
317  redoIntegral[i] = true;
318  cachedMasses[i] = -1;
319  cachedWidths[i] = -1;
320  integrators[i] = new SpecialDalitzIntegrator *[decayInfo.resonances.size()];
321  calculators[i] = new SpecialWaveCalculator(parameters, i);
322  integrals[i] = new ThreeComplex *[decayInfo.resonances.size()];
323 
324  for(int j = 0; j < decayInfo.resonances.size(); ++j) {
325  integrals[i][j] = new ThreeComplex(0, 0, 0, 0, 0, 0);
326  integrators[i][j] = new SpecialDalitzIntegrator(parameters, i, j);
327  }
328  }
329 
330  addSpecialMask(PdfBase::ForceSeparateNorm);
331 }
332 
333 __host__ TddpPdf::TddpPdf(std::string n,
334  Observable _dtime,
335  Observable _sigmat,
336  Observable m12,
337  Observable m13,
338  EventNumber eventNumber,
339  DecayInfo3t decay,
340  std::vector<MixingTimeResolution *> &r,
341  GooPdf *efficiency,
342  Observable md0,
343  Observable *mistag)
344  : GooPdf(n, _dtime, _sigmat, m12, m13, eventNumber, md0)
345  , decayInfo(decay)
346  , _m12(m12)
347  , _m13(m13)
348  , resolution(
349  r[0]) // Only used for normalisation, which only depends on x and y - it doesn't matter which one we use.
350  , totalEventSize(6) // This case adds the D0 mass by default.
351 {
352  for(auto &cachedWave : cachedWaves)
353  cachedWave = nullptr;
354 
355  fptype decayConstants[8];
356  decayConstants[5] = 0;
357  decayConstants[6] = md0.getLowerLimit();
358  decayConstants[7] = (md0.getUpperLimit() - md0.getLowerLimit()) / r.size();
359 
360  if(mistag) {
361  registerObservable(*mistag);
362  totalEventSize++;
363  decayConstants[5] = 1; // Flags existence of mistag
364  }
365 
366  std::vector<unsigned int> pindices;
367  pindices.push_back(registerConstants(8));
368  decayConstants[0] = decayInfo.motherMass;
369  decayConstants[1] = decayInfo.daug1Mass;
370  decayConstants[2] = decayInfo.daug2Mass;
371  decayConstants[3] = decayInfo.daug3Mass;
372  decayConstants[4] = decayInfo.meson_radius;
373  MEMCPY_TO_SYMBOL(
374  functorConstants, decayConstants, 8 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);
375 
376  pindices.push_back(registerParameter(decayInfo._tau));
377  pindices.push_back(registerParameter(decayInfo._xmixing));
378  pindices.push_back(registerParameter(decayInfo._ymixing));
379  pindices.push_back(SPECIAL_RESOLUTION_FLAG); // Flag existence of multiple resolution functions.
380  pindices.push_back(decayInfo.resonances.size());
381 
382  static int cacheCount = 0;
383  cacheToUse = cacheCount++;
384  pindices.push_back(cacheToUse);
385 
386  for(auto &resonance : decayInfo.resonances) {
387  pindices.push_back(registerParameter(resonance->amp_real));
388  pindices.push_back(registerParameter(resonance->amp_imag));
389  pindices.push_back(resonance->getFunctionIndex());
390  pindices.push_back(resonance->getParameterIndex());
391  resonance->setConstantIndex(cIndex);
392  components.push_back(resonance);
393  }
394 
395  pindices.push_back(efficiency->getFunctionIndex());
396  pindices.push_back(efficiency->getParameterIndex());
397  components.push_back(efficiency);
398 
399  pindices.push_back(r.size() - 1); // Highest index, not number of functions.
400 
401  for(auto &i : r) {
402  if(i->getDeviceFunction() < 0)
403  throw GooFit::GeneralError("Device function index {} must be more than 0", i->getDeviceFunction());
404  pindices.push_back(static_cast<unsigned int>(i->getDeviceFunction()));
405  i->createParameters(pindices, this);
406  }
407 
408  GET_FUNCTION_ADDR(ptr_to_Tddp);
409  initialize(pindices);
410 
411  redoIntegral = new bool[decayInfo.resonances.size()];
412  cachedMasses = new fptype[decayInfo.resonances.size()];
413  cachedWidths = new fptype[decayInfo.resonances.size()];
414  integrals = new ThreeComplex **[decayInfo.resonances.size()];
415  integrators = new SpecialDalitzIntegrator **[decayInfo.resonances.size()];
416  calculators = new SpecialWaveCalculator *[decayInfo.resonances.size()];
417 
418  for(int i = 0; i < decayInfo.resonances.size(); ++i) {
419  redoIntegral[i] = true;
420  cachedMasses[i] = -1;
421  cachedWidths[i] = -1;
422  integrators[i] = new SpecialDalitzIntegrator *[decayInfo.resonances.size()];
423  calculators[i] = new SpecialWaveCalculator(parameters, i);
424  integrals[i] = new ThreeComplex *[decayInfo.resonances.size()];
425 
426  for(int j = 0; j < decayInfo.resonances.size(); ++j) {
427  integrals[i][j] = new ThreeComplex(0, 0, 0, 0, 0, 0);
428  integrators[i][j] = new SpecialDalitzIntegrator(parameters, i, j);
429  }
430  }
431 
432  addSpecialMask(PdfBase::ForceSeparateNorm);
433 }
434 
435 __host__ void TddpPdf::setDataSize(unsigned int dataSize, unsigned int evtSize) {
436  // Default 5 is m12, m13, time, sigma_t, evtNum
437  totalEventSize = evtSize;
438  if(totalEventSize < 5)
439  throw GooFit::GeneralError("totalEventSize {} must be 5 or more", totalEventSize);
440 
441  if(cachedWaves[0]) {
442  for(auto &cachedWave : cachedWaves)
443  delete cachedWave;
444  }
445 
446  numEntries = dataSize;
447 
448 // Ideally this would not be required, this would be called AFTER setData which will set m_iEventsPerTask
449 #ifdef GOOFIT_MPI
450  int myId, numProcs;
451  MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
452  MPI_Comm_rank(MPI_COMM_WORLD, &myId);
453 
454  int perTask = numEntries / numProcs;
455 
456  int *counts = new int[numProcs];
457 
458  for(int i = 0; i < numProcs - 1; i++)
459  counts[i] = perTask;
460 
461  counts[numProcs - 1] = numEntries - perTask * (numProcs - 1);
462 
463  setNumPerTask(this, counts[myId]);
464 
465  delete[] counts;
466 #endif
467 
468  for(int i = 0; i < 16; i++) {
469 #ifdef GOOFIT_MPI
470  cachedWaves[i] = new thrust::device_vector<WaveHolder_s>(m_iEventsPerTask);
471 #else
472  cachedWaves[i] = new thrust::device_vector<WaveHolder_s>(dataSize);
473 #endif
474  void *dummy = thrust::raw_pointer_cast(cachedWaves[i]->data());
475  MEMCPY_TO_SYMBOL(cWaves, &dummy, sizeof(WaveHolder_s *), i * sizeof(WaveHolder_s *), cudaMemcpyHostToDevice);
476  }
477 
478  setForceIntegrals();
479 }
480 
481 __host__ fptype TddpPdf::normalize() const {
482  recursiveSetNormalisation(1); // Not going to normalize efficiency,
483  // so set normalisation factor to 1 so it doesn't get multiplied by zero.
484  // Copy at this time to ensure that the SpecialWaveCalculators, which need the efficiency,
485  // don't get zeroes through multiplying by the normFactor.
486  MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
487  // std::cout << "TDDP normalisation " << getName() << std::endl;
488 
489  int totalBins = _m12.getNumBins() * _m13.getNumBins();
490 
491  if(!dalitzNormRange) {
492  gooMalloc((void **)&dalitzNormRange, 6 * sizeof(fptype));
493 
494  auto *host_norms = new fptype[6];
495  host_norms[0] = _m12.getLowerLimit();
496  host_norms[1] = _m12.getUpperLimit();
497  host_norms[2] = _m12.getNumBins();
498  host_norms[3] = _m13.getLowerLimit();
499  host_norms[4] = _m13.getUpperLimit();
500  host_norms[5] = _m13.getNumBins();
501  MEMCPY(dalitzNormRange, host_norms, 6 * sizeof(fptype), cudaMemcpyHostToDevice);
502  delete[] host_norms;
503  }
504 
505  for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
506  redoIntegral[i] = forceRedoIntegrals;
507 
508  if(!(decayInfo.resonances[i]->parametersChanged()))
509  continue;
510 
511  redoIntegral[i] = true;
512  }
513 
514  forceRedoIntegrals = false;
515 
516  // Only do this bit if masses or widths have changed.
517  thrust::constant_iterator<fptype *> arrayAddress(dalitzNormRange);
518  thrust::counting_iterator<int> binIndex(0);
519 
520  // NB, SpecialWaveCalculator assumes that fit is unbinned!
521  // And it needs to know the total event size, not just observables
522  // for this particular PDF component.
523  thrust::constant_iterator<fptype *> dataArray(dev_event_array);
524  thrust::constant_iterator<int> eventSize(totalEventSize);
525  thrust::counting_iterator<int> eventIndex(0);
526 
527  static int normCall = 0;
528  normCall++;
529 
530  for(int i = 0; i < decayInfo.resonances.size(); ++i) {
531  if(redoIntegral[i]) {
532 #ifdef GOOFIT_MPI
533  thrust::transform(
534  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
535  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
536  strided_range<thrust::device_vector<WaveHolder_s>::iterator>(
537  cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
538  .begin(),
539  *(calculators[i]));
540 #else
541  thrust::transform(
542  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
543  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
544  strided_range<thrust::device_vector<WaveHolder_s>::iterator>(
545  cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
546  .begin(),
547  *(calculators[i]));
548 #endif
549  // std::cout << "Integral for resonance " << i << " " << numEntries << " " << totalEventSize << std::endl;
550  }
551 
552  // Possibly this can be done more efficiently by exploiting symmetry?
553  for(int j = 0; j < decayInfo.resonances.size(); ++j) {
554  if((!redoIntegral[i]) && (!redoIntegral[j]))
555  continue;
556 
557  ThreeComplex dummy(0, 0, 0, 0, 0, 0);
558  SpecialComplexSum complexSum;
559  (*(integrals[i][j])) = thrust::transform_reduce(
560  thrust::make_zip_iterator(thrust::make_tuple(binIndex, arrayAddress)),
561  thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, arrayAddress)),
562  *(integrators[i][j]),
563  dummy,
564  complexSum);
565  /*
566  std::cout << "With resonance " << j << ": "
567  << thrust::get<0>(*(integrals[i][j])) << " "
568  << thrust::get<1>(*(integrals[i][j])) << " "
569  << thrust::get<2>(*(integrals[i][j])) << " "
570  << thrust::get<3>(*(integrals[i][j])) << " "
571  << thrust::get<4>(*(integrals[i][j])) << " "
572  << thrust::get<5>(*(integrals[i][j])) << std::endl;
573  */
574  }
575  }
576 
577  // End of time-consuming integrals.
578 
579  fpcomplex integralA_2(0, 0);
580  fpcomplex integralB_2(0, 0);
581  fpcomplex integralABs(0, 0);
582 
583  for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
584  int param_i = parameters + resonanceOffset + resonanceSize * i;
585  fpcomplex amplitude_i(host_params[host_indices[param_i]], host_params[host_indices[param_i + 1]]);
586 
587  for(unsigned int j = 0; j < decayInfo.resonances.size(); ++j) {
588  int param_j = parameters + resonanceOffset + resonanceSize * j;
589  fpcomplex amplitude_j(host_params[host_indices[param_j]],
590  -host_params[host_indices[param_j + 1]]); // Notice complex conjugation
591 
592  integralA_2 += (amplitude_i * amplitude_j
593  * fpcomplex(thrust::get<0>(*(integrals[i][j])), thrust::get<1>(*(integrals[i][j]))));
594  integralABs += (amplitude_i * amplitude_j
595  * fpcomplex(thrust::get<2>(*(integrals[i][j])), thrust::get<3>(*(integrals[i][j]))));
596  integralB_2 += (amplitude_i * amplitude_j
597  * fpcomplex(thrust::get<4>(*(integrals[i][j])), thrust::get<5>(*(integrals[i][j]))));
598 
599  /*
600  if (cpuDebug & 1) {
601  int idx = i * decayInfo.resonances.size() + j;
602  if (0 == host_callnumber) std::cout << "Integral contribution " << i << ", " << j << " " << idx << " : "
603  << amplitude_i << " "
604  << amplitude_j << " ("
605  << real(amplitude_i * amplitude_j * complex<fptype>(thrust::get<0>(*(integrals[i][j])),
606  thrust::get<1>(*(integrals[i][j])))) << ", "
607  << imag(amplitude_i * amplitude_j * complex<fptype>(thrust::get<0>(*(integrals[i][j])),
608  thrust::get<1>(*(integrals[i][j])))) << ") ("
609  << real(amplitude_i * amplitude_j * complex<fptype>(thrust::get<2>(*(integrals[i][j])),
610  thrust::get<3>(*(integrals[i][j])))) << ", "
611  << imag(amplitude_i * amplitude_j * complex<fptype>(thrust::get<2>(*(integrals[i][j])),
612  thrust::get<3>(*(integrals[i][j])))) << ") ("
613  << real(amplitude_i * amplitude_j * complex<fptype>(thrust::get<4>(*(integrals[i][j])),
614  thrust::get<5>(*(integrals[i][j])))) << ", "
615  << imag(amplitude_i * amplitude_j * complex<fptype>(thrust::get<4>(*(integrals[i][j])),
616  thrust::get<5>(*(integrals[i][j])))) << ") "
617  << thrust::get<0>(*(integrals[i][j])) << ", "
618  << thrust::get<1>(*(integrals[i][j])) << ") ("
619  << thrust::get<2>(*(integrals[i][j])) << ", "
620  << thrust::get<3>(*(integrals[i][j])) << ") ("
621  << thrust::get<4>(*(integrals[i][j])) << ", "
622  << thrust::get<5>(*(integrals[i][j])) << ") ("
623  << real(integralA_2) << ", " << imag(integralA_2) << ") "
624  << std::endl;
625  }
626  */
627  }
628  }
629 
630  double dalitzIntegralOne = integralA_2.real(); // Notice that this is already the abs2, so it's real by
631  // construction; but the compiler doesn't know that.
632  double dalitzIntegralTwo = integralB_2.real();
633  double dalitzIntegralThr = integralABs.real();
634  double dalitzIntegralFou = integralABs.imag();
635 
636  fptype tau = host_params[host_indices[parameters + 2]];
637  fptype xmixing = host_params[host_indices[parameters + 3]];
638  fptype ymixing = host_params[host_indices[parameters + 4]];
639 
640  fptype ret = resolution->normalisation(
641  dalitzIntegralOne, dalitzIntegralTwo, dalitzIntegralThr, dalitzIntegralFou, tau, xmixing, ymixing);
642 
643  double binSizeFactor = 1;
644  binSizeFactor *= ((_m12.getUpperLimit() - _m12.getLowerLimit()) / _m12.getNumBins());
645  binSizeFactor *= ((_m13.getUpperLimit() - _m13.getLowerLimit()) / _m13.getNumBins());
646  ret *= binSizeFactor;
647 
648  host_normalisation[parameters] = 1.0 / ret;
649  // std::cout << "End of TDDP normalisation: " << ret << " " << host_normalisation[parameters] << " " <<
650  // binSizeFactor << std::endl;
651  return ret;
652 }
653 //#endif
654 
655 SpecialDalitzIntegrator::SpecialDalitzIntegrator(int pIdx, unsigned int ri, unsigned int rj)
656  : resonance_i(ri)
657  , resonance_j(rj)
658  , parameters(pIdx) {}
659 
660 __device__ ThreeComplex SpecialDalitzIntegrator::operator()(thrust::tuple<int, fptype *> t) const {
661  // Bin index, base address [lower, upper,getNumBins]
662  // Notice that this is basically MetricTaker::operator (binned) with the special-case knowledge
663  // that event size is two, and that the function to call is dev_Tddp_calcIntegrals.
664 
665  int globalBinNumber = thrust::get<0>(t);
666  fptype lowerBoundM12 = thrust::get<1>(t)[0];
667  fptype upperBoundM12 = thrust::get<1>(t)[1];
668  auto numBinsM12 = static_cast<int>(floor(thrust::get<1>(t)[2] + 0.5));
669  int binNumberM12 = globalBinNumber % numBinsM12;
670  fptype binCenterM12 = upperBoundM12 - lowerBoundM12;
671  binCenterM12 /= numBinsM12;
672  binCenterM12 *= (binNumberM12 + 0.5);
673  binCenterM12 += lowerBoundM12;
674 
675  globalBinNumber /= numBinsM12;
676  fptype lowerBoundM13 = thrust::get<1>(t)[3];
677  fptype upperBoundM13 = thrust::get<1>(t)[4];
678  auto numBinsM13 = static_cast<int>(floor(thrust::get<1>(t)[5] + 0.5));
679  fptype binCenterM13 = upperBoundM13 - lowerBoundM13;
680  binCenterM13 /= numBinsM13;
681  binCenterM13 *= (globalBinNumber + 0.5);
682  binCenterM13 += lowerBoundM13;
683 
684  // if (0 == THREADIDX) cuPrintf("%i %i %i %f %f operator\n", thrust::get<0>(t), thrust::get<0>(t) % numBinsM12,
685  // globalBinNumber, binCenterM12, binCenterM13);
686  unsigned int *indices = paramIndices + parameters;
687  ThreeComplex ret
688  = device_Tddp_calcIntegrals(binCenterM12, binCenterM13, resonance_i, resonance_j, cudaArray, indices);
689 
690  fptype fakeEvt[10]; // Need room for many observables in case m12 or m13 were assigned a high index in an
691  // event-weighted fit.
692  fakeEvt[RO_CACHE(indices[RO_CACHE(indices[0]) + 2 + 2])] = binCenterM12;
693  fakeEvt[RO_CACHE(indices[RO_CACHE(indices[0]) + 2 + 3])] = binCenterM13;
694  unsigned int numResonances = indices[6];
695  int effFunctionIdx = parIndexFromResIndex(numResonances);
696  // if (thrust::get<0>(t) == 19840) {internalDebug1 = BLOCKIDX; internalDebug2 = THREADIDX;}
697  // fptype eff = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[effFunctionIdx]])))(fakeEvt,
698  // cudaArray, paramIndices + indices[effFunctionIdx + 1]);
699  fptype eff = callFunction(fakeEvt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1]));
700  // if (thrust::get<0>(t) == 19840) {
701  // internalDebug1 = -1;
702  // internalDebug2 = -1;
703  // printf("Efficiency: %i %f %f %f %i\n", thrust::get<0>(t), binCenterM12, binCenterM13, eff, effFunctionIdx);
704  // printf("Efficiency: %f %f %f %f %f %i %i\n", fakeEvt[0], fakeEvt[1], fakeEvt[2], fakeEvt[3], fakeEvt[4],
705  // indices[indices[0] + 2 + 2], indices[indices[0] + 2 + 3]);
706  //}
707 
708  // Multiplication by eff, not sqrt(eff), is correct:
709  // These complex numbers will not be squared when they
710  // go into the integrals. They've been squared already,
711  // as it were.
712  thrust::get<0>(ret) *= eff;
713  thrust::get<1>(ret) *= eff;
714  thrust::get<2>(ret) *= eff;
715  thrust::get<3>(ret) *= eff;
716  thrust::get<4>(ret) *= eff;
717  thrust::get<5>(ret) *= eff;
718  return ret;
719 }
720 
721 SpecialWaveCalculator::SpecialWaveCalculator(int pIdx, unsigned int res_idx)
722  : resonance_i(res_idx)
723  , parameters(pIdx) {}
724 
725 __device__ WaveHolder_s SpecialWaveCalculator::operator()(thrust::tuple<int, fptype *, int> t) const {
726  // Calculates the BW values for a specific resonance.
727  // The 'A' wave stores the value at each point, the 'B'
728  // at the opposite (reversed) point.
729 
730  WaveHolder_s ret;
731  ret.ai_real = 0.0;
732  ret.ai_imag = 0.0;
733  ret.bi_real = 0.0;
734  ret.bi_imag = 0.0;
735 
736  int evtNum = thrust::get<0>(t);
737  fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
738 
739  unsigned int *indices = paramIndices + parameters; // Jump to TDDP position within parameters array
740  fptype m12 = RO_CACHE(evt[indices[4 + indices[0]]]);
741  fptype m13 = RO_CACHE(evt[indices[5 + indices[0]]]);
742 
743  fptype motherMass = functorConstants[indices[1] + 0];
744  fptype daug1Mass = functorConstants[indices[1] + 1];
745  fptype daug2Mass = functorConstants[indices[1] + 2];
746  fptype daug3Mass = functorConstants[indices[1] + 3];
747 
748  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
749  return ret;
750 
751  fptype m23
752  = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
753 
754  int parameter_i = parIndexFromResIndex(resonance_i); // Find position of this resonance relative to TDDP start
755  unsigned int functn_i = indices[parameter_i + 2];
756  unsigned int params_i = indices[parameter_i + 3];
757 
758  fpcomplex ai = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
759  fpcomplex bi = getResonanceAmplitude(m13, m12, m23, functn_i, params_i);
760 
761  // printf("Amplitudes %f, %f => (%f %f) (%f %f)\n", m12, m13, ai.real, ai.imag, bi.real, bi.imag);
762 
763  ret.ai_real = ai.real();
764  ret.ai_imag = ai.imag();
765  ret.bi_real = bi.real();
766  ret.bi_imag = bi.imag();
767 
768  return ret;
769 }
770 
771 } // namespace GooFit