GooFit  v2.1.3
DalitzPlotPdf.cu
Go to the documentation of this file.
1 #include <goofit/Error.h>
2 #include <goofit/PDFs/physics/DalitzPlotPdf.h>
3 #include <goofit/detail/Complex.h>
4 
5 #include <thrust/copy.h>
6 #include <thrust/transform_reduce.h>
7 
8 namespace GooFit {
9 
10 // Functor used for fit fraction sum
11 struct CoefSumFunctor {
12  fpcomplex coef_i;
13  fpcomplex coef_j;
14 
15  CoefSumFunctor(fpcomplex coef_i, fpcomplex coef_j)
16  : coef_i(coef_i)
17  , coef_j(coef_j) {}
18 
19  __device__ fptype operator()(thrust::tuple<fpcomplex, fpcomplex> val) {
20  return (coef_i * thrust::conj<fptype>(coef_j) * thrust::get<0>(val) * thrust::conj<fptype>(thrust::get<1>(val)))
21  .real();
22  }
23 };
24 
25 constexpr int resonanceOffset_DP = 4; // Offset of the first resonance into the parameter index array
26 // Offset is number of parameters, constant index, number of resonances (not calculable
27 // from nP because we don't know what the efficiency might need), and cache index. Efficiency
28 // parameters are after the resonance information.
29 
30 // The function of this array is to hold all the cached waves; specific
31 // waves are recalculated when the corresponding resonance mass or width
32 // changes. Note that in a multithread environment each thread needs its
33 // own cache, hence the '10'. Ten threads should be enough for anyone!
34 
35 // NOTE: This is does not support ten instances (ten threads) of resoncances now, only one set of resonances.
36 __device__ fpcomplex *cResonances[16];
37 
38 __device__ inline int parIndexFromResIndex_DP(int resIndex) { return resonanceOffset_DP + resIndex * resonanceSize; }
39 
40 __device__ fpcomplex
41 device_DalitzPlot_calcIntegrals(fptype m12, fptype m13, int res_i, int res_j, fptype *p, unsigned int *indices) {
42  // Calculates BW_i(m12, m13) * BW_j^*(m12, m13).
43  // This calculation is in a separate function so
44  // it can be cached. Note that this function expects
45  // to be called on a normalisation grid, not on
46  // observed points, that's why it doesn't use
47  // cResonances. No need to cache the values at individual
48  // grid points - we only care about totals.
49  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
50  fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
51  fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
52  fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
53 
54  fpcomplex ret{0., 0.};
55 
56  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
57  return ret;
58 
59  fptype m23
60  = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
61 
62  int parameter_i = parIndexFromResIndex_DP(res_i);
63  unsigned int functn_i = RO_CACHE(indices[parameter_i + 2]);
64  unsigned int params_i = RO_CACHE(indices[parameter_i + 3]);
65  ret = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
66 
67  int parameter_j = parIndexFromResIndex_DP(res_j);
68  unsigned int functn_j = RO_CACHE(indices[parameter_j + 2]);
69  unsigned int params_j = RO_CACHE(indices[parameter_j + 3]);
70  ret *= conj(getResonanceAmplitude(m12, m13, m23, functn_j, params_j));
71 
72  return ret;
73 }
74 
75 __device__ fptype device_DalitzPlot(fptype *evt, fptype *p, unsigned int *indices) {
76  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
77  fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
78  fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
79  fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
80 
81  fptype m12 = RO_CACHE(evt[RO_CACHE(indices[2 + RO_CACHE(indices[0])])]);
82  fptype m13 = RO_CACHE(evt[RO_CACHE(indices[3 + RO_CACHE(indices[0])])]);
83 
84  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
85  return 0;
86 
87  fptype evtIndex = RO_CACHE(evt[RO_CACHE(indices[4 + RO_CACHE(indices[0])])]);
88 
89  auto evtNum = static_cast<int>(floor(0.5 + evtIndex));
90 
91  fpcomplex totalAmp(0, 0);
92  unsigned int numResonances = RO_CACHE(indices[2]);
93  // unsigned int cacheToUse = RO_CACHE(indices[3]);
94 
95  for(int i = 0; i < numResonances; ++i) {
96  int paramIndex = parIndexFromResIndex_DP(i);
97  fpcomplex amp
98  = fpcomplex(RO_CACHE(p[RO_CACHE(indices[paramIndex + 0])]), RO_CACHE(p[RO_CACHE(indices[paramIndex + 1])]));
99 
100  // potential performance improvement by
101  // double2 me = RO_CACHE(reinterpret_cast<double2*> (cResonances[i][evtNum]));
102  fpcomplex me = RO_CACHE(cResonances[i][evtNum]);
103 
104  // fpcomplex matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
105  // (cResonances[cacheToUse][evtNum*numResonances + i]).imag);
106  // fpcomplex matrixelement (me[0], me[1]);
107 
108  totalAmp += amp * me;
109  }
110 
111  fptype ret = thrust::norm(totalAmp);
112  int effFunctionIdx = parIndexFromResIndex_DP(numResonances);
113  fptype eff = callFunction(evt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1]));
114  ret *= eff;
115 
116  // printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real,
117  // totalAmp.imag);
118 
119  return ret;
120 }
121 
122 __device__ device_function_ptr ptr_to_DalitzPlot = device_DalitzPlot;
123 
124 __host__ DalitzPlotPdf::DalitzPlotPdf(
125  std::string n, Observable m12, Observable m13, EventNumber eventNumber, DecayInfo3 decay, GooPdf *efficiency)
126  : GooPdf(n, m12, m13, eventNumber)
127  , decayInfo(decay)
128  , _m12(m12)
129  , _m13(m13)
130  , _eventNumber(eventNumber)
131  , dalitzNormRange(nullptr)
132  //, cachedWaves(0)
133  , integrals(nullptr)
134  , forceRedoIntegrals(true)
135  , totalEventSize(3) // Default 3 = m12, m13, evtNum
136  , cacheToUse(0)
137  , integrators(nullptr)
138  , calculators(nullptr) {
139  fptype decayConstants[5];
140 
141  for(auto &cachedWave : cachedWaves)
142  cachedWave = nullptr;
143 
144  std::vector<unsigned int> pindices;
145  pindices.push_back(registerConstants(5));
146  decayConstants[0] = decayInfo.motherMass;
147  decayConstants[1] = decayInfo.daug1Mass;
148  decayConstants[2] = decayInfo.daug2Mass;
149  decayConstants[3] = decayInfo.daug3Mass;
150  decayConstants[4] = decayInfo.meson_radius;
151  MEMCPY_TO_SYMBOL(
152  functorConstants, decayConstants, 5 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);
153 
154  pindices.push_back(decayInfo.resonances.size());
155  static int cacheCount = 0;
156  cacheToUse = cacheCount++;
157  pindices.push_back(cacheToUse);
158 
159  for(auto &resonance : decayInfo.resonances) {
160  pindices.push_back(registerParameter(resonance->amp_real));
161  pindices.push_back(registerParameter(resonance->amp_imag));
162  pindices.push_back(resonance->getFunctionIndex());
163  pindices.push_back(resonance->getParameterIndex());
164  resonance->setConstantIndex(cIndex);
165  components.push_back(resonance);
166  }
167 
168  pindices.push_back(efficiency->getFunctionIndex());
169  pindices.push_back(efficiency->getParameterIndex());
170  components.push_back(efficiency);
171 
172  GET_FUNCTION_ADDR(ptr_to_DalitzPlot);
173  initialize(pindices);
174 
175  redoIntegral = new bool[decayInfo.resonances.size()];
176  cachedMasses = new fptype[decayInfo.resonances.size()];
177  cachedWidths = new fptype[decayInfo.resonances.size()];
178  integrals = new fpcomplex **[decayInfo.resonances.size()];
179  integrators = new SpecialResonanceIntegrator **[decayInfo.resonances.size()];
180  calculators = new SpecialResonanceCalculator *[decayInfo.resonances.size()];
181 
182  for(int i = 0; i < decayInfo.resonances.size(); ++i) {
183  redoIntegral[i] = true;
184  cachedMasses[i] = -1;
185  cachedWidths[i] = -1;
186  integrators[i] = new SpecialResonanceIntegrator *[decayInfo.resonances.size()];
187  calculators[i] = new SpecialResonanceCalculator(parameters, i);
188  integrals[i] = new fpcomplex *[decayInfo.resonances.size()];
189 
190  for(int j = 0; j < decayInfo.resonances.size(); ++j) {
191  integrals[i][j] = new fpcomplex(0, 0);
192  integrators[i][j] = new SpecialResonanceIntegrator(parameters, i, j);
193  }
194  }
195 
196  addSpecialMask(PdfBase::ForceSeparateNorm);
197 }
198 
199 __host__ void DalitzPlotPdf::setDataSize(unsigned int dataSize, unsigned int evtSize) {
200  // Default 3 is m12, m13, evtNum
201  totalEventSize = evtSize;
202  if(totalEventSize < 3)
203  throw GooFit::GeneralError("totalEventSize {} must be 3 or more", totalEventSize);
204 
205  // if (cachedWaves) delete cachedWaves;
206  if(cachedWaves[0]) {
207  for(auto &cachedWave : cachedWaves) {
208  delete cachedWave;
209  cachedWave = nullptr;
210  }
211  }
212 
213  numEntries = dataSize;
214 
215  for(int i = 0; i < 16; i++) {
216 #ifdef GOOFIT_MPI
217  cachedWaves[i] = new thrust::device_vector<fpcomplex>(m_iEventsPerTask);
218 #else
219  cachedWaves[i] = new thrust::device_vector<fpcomplex>(dataSize);
220 #endif
221  void *dummy = thrust::raw_pointer_cast(cachedWaves[i]->data());
222  MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(fpcomplex *), i * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
223  }
224 
225  setForceIntegrals();
226 }
227 
228 __host__ fptype DalitzPlotPdf::normalize() const {
229  recursiveSetNormalisation(1); // Not going to normalize efficiency,
230  // so set normalisation factor to 1 so it doesn't get multiplied by zero.
231  // Copy at this time to ensure that the SpecialResonanceCalculators, which need the efficiency,
232  // don't get zeroes through multiplying by the normFactor.
233  MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
234 
235  int totalBins = _m12.getNumBins() * _m13.getNumBins();
236 
237  if(!dalitzNormRange) {
238  gooMalloc((void **)&dalitzNormRange, 6 * sizeof(fptype));
239  }
240 
241  // This line runs once
242  static std::array<fptype, 6> host_norms{{0, 0, 0, 0, 0, 0}};
243 
244  std::array<fptype, 6> current_host_norms{{_m12.getLowerLimit(),
245  _m12.getUpperLimit(),
246  static_cast<fptype>(_m12.getNumBins()),
247  _m13.getLowerLimit(),
248  _m13.getUpperLimit(),
249  static_cast<fptype>(_m13.getNumBins())}};
250 
251  if(host_norms != current_host_norms) {
252  host_norms = current_host_norms;
253  MEMCPY(dalitzNormRange, host_norms.data(), 6 * sizeof(fptype), cudaMemcpyHostToDevice);
254  }
255 
256  for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
257  redoIntegral[i] = forceRedoIntegrals;
258 
259  if(!(decayInfo.resonances[i]->parametersChanged()))
260  continue;
261 
262  redoIntegral[i] = true;
263  }
264 
265  forceRedoIntegrals = false;
266 
267  // Only do this bit if masses or widths have changed.
268  thrust::constant_iterator<fptype *> arrayAddress(dalitzNormRange);
269  thrust::counting_iterator<int> binIndex(0);
270 
271  // NB, SpecialResonanceCalculator assumes that fit is unbinned!
272  // And it needs to know the total event size, not just observables
273  // for this particular PDF component.
274  thrust::constant_iterator<fptype *> dataArray(dev_event_array);
275  thrust::constant_iterator<int> eventSize(totalEventSize);
276  thrust::counting_iterator<int> eventIndex(0);
277 
278  for(int i = 0; i < decayInfo.resonances.size(); ++i) {
279  if(redoIntegral[i]) {
280 #ifdef GOOFIT_MPI
281  thrust::transform(
282  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
283  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
284  strided_range<thrust::device_vector<fpcomplex>::iterator>(
285  cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
286  .begin(),
287  *(calculators[i]));
288 #else
289  thrust::transform(
290  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
291  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
292  strided_range<thrust::device_vector<fpcomplex>::iterator>(
293  cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
294  .begin(),
295  *(calculators[i]));
296 #endif
297  }
298 
299  // Possibly this can be done more efficiently by exploiting symmetry?
300  for(int j = 0; j < decayInfo.resonances.size(); ++j) {
301  if((!redoIntegral[i]) && (!redoIntegral[j]))
302  continue;
303 
304  fpcomplex dummy(0, 0);
305  thrust::plus<fpcomplex> complexSum;
306  (*(integrals[i][j])) = thrust::transform_reduce(
307  thrust::make_zip_iterator(thrust::make_tuple(binIndex, arrayAddress)),
308  thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, arrayAddress)),
309  *(integrators[i][j]),
310  dummy,
311  complexSum);
312  }
313  }
314 
315  // End of time-consuming integrals.
316  fpcomplex sumIntegral(0, 0);
317 
318  for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
319  int param_i = parameters + resonanceOffset_DP + resonanceSize * i;
320  fpcomplex amplitude_i(host_params[host_indices[param_i]], host_params[host_indices[param_i + 1]]);
321 
322  for(unsigned int j = 0; j < decayInfo.resonances.size(); ++j) {
323  int param_j = parameters + resonanceOffset_DP + resonanceSize * j;
324  fpcomplex amplitude_j(host_params[host_indices[param_j]], -host_params[host_indices[param_j + 1]]);
325  // Notice complex conjugation
326  // printf("%f %f %f %f %f %f\n", amplitude_i.real(), amplitude_i.imag(), amplitude_j.real(),
327  // amplitude_j.imag(), (*(integrals[i][j])).real(), (*(integrals[i][j])).imag() );
328  sumIntegral += amplitude_i * amplitude_j * (*(integrals[i][j]));
329  }
330  }
331 
332  fptype ret = sumIntegral.real(); // That complex number is a square, so it's fully real
333  double binSizeFactor = 1;
334  binSizeFactor *= _m12.getBinSize();
335  binSizeFactor *= _m13.getBinSize();
336  ret *= binSizeFactor;
337 
338  host_normalisation[parameters] = 1.0 / ret;
339  // printf("%f %f\n", ret, binSizeFactor);
340  return ret;
341 }
342 
343 __host__ fpcomplex DalitzPlotPdf::sumCachedWave(size_t i) const {
344  const thrust::device_vector<fpcomplex> &vec = getCachedWaveNoCopy(i);
345 
346  fpcomplex ret = thrust::reduce(vec.begin(), vec.end(), fpcomplex(0, 0), thrust::plus<fpcomplex>());
347 
348  return ret;
349 }
350 
351 __host__ const std::vector<std::complex<fptype>> DalitzPlotPdf::getCachedWave(size_t i) const {
352  auto ret_thrust = getCachedWave(i);
353  std::vector<std::complex<fptype>> ret(ret_thrust.size());
354  thrust::copy(ret_thrust.begin(), ret_thrust.end(), ret.begin());
355  return ret;
356 }
357 
358 __host__ std::vector<std::vector<fptype>> DalitzPlotPdf::fit_fractions() {
359  GOOFIT_DEBUG("Performing fit fraction calculation, should already have a cache (does not use normalization grid)");
360 
361  size_t n_res = getDecayInfo().resonances.size();
362  size_t nEntries = getCachedWaveNoCopy(0).size();
363 
364  std::vector<fpcomplex> coefs(n_res);
365  std::transform(getDecayInfo().resonances.begin(),
366  getDecayInfo().resonances.end(),
367  coefs.begin(),
368  [](ResonancePdf *res) { return fpcomplex(res->amp_real.getValue(), res->amp_imag.getValue()); });
369 
370  fptype buffer_all = 0;
371  fptype buffer;
372  fpcomplex coef_i;
373  fpcomplex coef_j;
374  fpcomplex cached_i_val;
375  fpcomplex cached_j_val;
376 
377  thrust::device_vector<fpcomplex> cached_i;
378  thrust::device_vector<fpcomplex> cached_j;
379  std::vector<std::vector<fptype>> Amps_int(n_res, std::vector<fptype>(n_res));
380 
381  for(size_t i = 0; i < n_res; i++) {
382  for(size_t j = 0; j < n_res; j++) {
383  buffer = 0;
384  cached_i = getCachedWaveNoCopy(i);
385  cached_j = getCachedWaveNoCopy(j);
386  coef_i = coefs[i];
387  coef_j = coefs[j];
388 
389  buffer += thrust::transform_reduce(
390  thrust::make_zip_iterator(thrust::make_tuple(cached_i.begin(), cached_j.begin())),
391  thrust::make_zip_iterator(thrust::make_tuple(cached_i.end(), cached_j.end())),
392  CoefSumFunctor(coef_i, coef_j),
393  (fptype)0.0,
394  thrust::plus<fptype>());
395 
396  buffer_all += buffer;
397  Amps_int[i][j] = (buffer / nEntries);
398  }
399  }
400 
401  fptype total_PDF = buffer_all / nEntries;
402 
403  std::vector<std::vector<fptype>> ff(n_res, std::vector<fptype>(n_res));
404 
405  for(size_t i = 0; i < n_res; i++)
406  for(size_t j = 0; j < n_res; j++)
407  ff[i][j] = (Amps_int[i][j] / total_PDF);
408 
409  return ff;
410 }
411 
412 SpecialResonanceIntegrator::SpecialResonanceIntegrator(int pIdx, unsigned int ri, unsigned int rj)
413  : resonance_i(ri)
414  , resonance_j(rj)
415  , parameters(pIdx) {}
416 
417 __device__ fpcomplex SpecialResonanceIntegrator::operator()(thrust::tuple<int, fptype *> t) const {
418  // Bin index, base address [lower, upper,getNumBins]
419  // Notice that this is basically MetricTaker::operator (binned) with the special-case knowledge
420  // that event size is two, and that the function to call is dev_DalitzPlot_calcIntegrals.
421 
422  int globalBinNumber = thrust::get<0>(t);
423  fptype lowerBoundM12 = thrust::get<1>(t)[0];
424  fptype upperBoundM12 = thrust::get<1>(t)[1];
425  auto numBinsM12 = static_cast<int>(floor(thrust::get<1>(t)[2] + 0.5));
426  int binNumberM12 = globalBinNumber % numBinsM12;
427  fptype binCenterM12 = upperBoundM12 - lowerBoundM12;
428  binCenterM12 /= numBinsM12;
429  binCenterM12 *= (binNumberM12 + 0.5);
430  binCenterM12 += lowerBoundM12;
431 
432  globalBinNumber /= numBinsM12;
433  fptype lowerBoundM13 = thrust::get<1>(t)[3];
434  fptype upperBoundM13 = thrust::get<1>(t)[4];
435  auto numBinsM13 = static_cast<int>(floor(thrust::get<1>(t)[5] + 0.5));
436  fptype binCenterM13 = upperBoundM13 - lowerBoundM13;
437  binCenterM13 /= numBinsM13;
438  binCenterM13 *= (globalBinNumber + 0.5);
439  binCenterM13 += lowerBoundM13;
440 
441  unsigned int *indices = paramIndices + parameters;
442  fpcomplex ret
443  = device_DalitzPlot_calcIntegrals(binCenterM12, binCenterM13, resonance_i, resonance_j, cudaArray, indices);
444 
445  fptype fakeEvt[10]; // Need room for many observables in case m12 or m13 were assigned a high index in an
446  // event-weighted fit.
447  fakeEvt[indices[indices[0] + 2 + 0]] = binCenterM12;
448  fakeEvt[indices[indices[0] + 2 + 1]] = binCenterM13;
449  unsigned int numResonances = indices[2];
450  int effFunctionIdx = parIndexFromResIndex_DP(numResonances);
451  fptype eff = callFunction(fakeEvt, indices[effFunctionIdx], indices[effFunctionIdx + 1]);
452 
453  // Multiplication by eff, not sqrt(eff), is correct:
454  // These complex numbers will not be squared when they
455  // go into the integrals. They've been squared already,
456  // as it were.
457  ret *= eff;
458  // printf("ret %f %f %f %f %f\n",binCenterM12, binCenterM13, ret.real, ret.imag, eff );
459  return ret;
460 }
461 
462 SpecialResonanceCalculator::SpecialResonanceCalculator(int pIdx, unsigned int res_idx)
463  : resonance_i(res_idx)
464  , parameters(pIdx) {}
465 
466 __device__ fpcomplex SpecialResonanceCalculator::operator()(thrust::tuple<int, fptype *, int> t) const {
467  // Calculates the BW values for a specific resonance.
468  fpcomplex ret;
469  int evtNum = thrust::get<0>(t);
470  fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
471 
472  unsigned int *indices = paramIndices + parameters; // Jump to DALITZPLOT position within parameters array
473  fptype m12 = evt[indices[2 + indices[0]]];
474  fptype m13 = evt[indices[3 + indices[0]]];
475 
476  fptype motherMass = functorConstants[indices[1] + 0];
477  fptype daug1Mass = functorConstants[indices[1] + 1];
478  fptype daug2Mass = functorConstants[indices[1] + 2];
479  fptype daug3Mass = functorConstants[indices[1] + 3];
480 
481  if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
482  return ret;
483 
484  fptype m23
485  = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
486 
487  int parameter_i
488  = parIndexFromResIndex_DP(resonance_i); // Find position of this resonance relative to DALITZPLOT start
489 
490  unsigned int functn_i = indices[parameter_i + 2];
491  unsigned int params_i = indices[parameter_i + 3];
492 
493  ret = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
494  // printf("Amplitude %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
495  return ret;
496 }
497 
498 } // namespace GooFit