GooFit  v2.1.3
Tddp4Pdf.cu
Go to the documentation of this file.
1 /*
2 04/05/2016 Christoph Hasse
3 DISCLAIMER:
4 
5 This code is not sufficently tested yet and still under heavy development!
6 
7 TODO:
8 - Test lineshapes, only done for BW_DP and BW_MINT so far
9 - Check and implement more SF
10 - Currently no check if the event is even allowed in phasespace is done. This should preferably be done outside of this
11 class.
12 
13 - Some things could be implemented differently maybe, performance should be compared for both cases.
14  -For example the way Spinfactors are stored in the same array as the Lineshape values.
15  Is this really worth the memory we lose by using a complex to store the SF?
16 */
17 #include <goofit/Error.h>
18 #include <goofit/Log.h>
19 #include <goofit/PDFs/physics/DP4Pdf.h>
20 #include <goofit/PDFs/physics/EvalVar.h>
21 #include <goofit/PDFs/physics/Tddp4Pdf.h>
22 #include <goofit/detail/Complex.h>
23 #include <mcbooster/Evaluate.h>
24 #include <mcbooster/EvaluateArray.h>
25 #include <mcbooster/GContainers.h>
26 #include <mcbooster/GFunctional.h>
27 #include <mcbooster/GTypes.h>
28 #include <mcbooster/Generate.h>
29 #include <mcbooster/Vector4R.h>
30 
31 namespace GooFit {
32 
33 struct genExp {
34  fptype gamma;
35  unsigned int offset;
36 
37  __host__ __device__ genExp(unsigned int c, fptype d)
38  : gamma(d)
39  , offset(c){};
40 
41  __host__ __device__ fptype operator()(unsigned int x) const {
42  thrust::random::default_random_engine rand(1431655765);
43  thrust::uniform_real_distribution<fptype> dist(0, 1);
44 
45  rand.discard(x + offset);
46 
47  return -log(dist(rand)) / gamma;
48  }
49 };
50 
51 struct exp_functor {
52  size_t tmpparam, tmpoff;
53  fptype gammamin, wmax;
54  exp_functor(size_t tmpparam, size_t tmpoff, fptype gammamin, fptype wmax)
55  : tmpparam(tmpparam)
56  , tmpoff(tmpoff)
57  , gammamin(gammamin)
58  , wmax(wmax) {}
59 
60  __device__ fptype operator()(thrust::tuple<unsigned int, fptype, fptype *, unsigned int> t) {
61  int evtNum = thrust::get<0>(t);
62  fptype *evt = thrust::get<2>(t) + (evtNum * thrust::get<3>(t));
63  unsigned int *indices = paramIndices + tmpparam;
64  fptype time = evt[indices[8 + indices[0]]];
65 
66  thrust::random::minstd_rand0 rand(1431655765);
67  thrust::uniform_real_distribution<fptype> dist(0, 1);
68  rand.discard(tmpoff + evtNum);
69 
70  return (dist(rand) * exp(-time * gammamin) * wmax) < thrust::get<1>(t);
71  }
72 };
73 
74 // The function of this array is to hold all the cached waves; specific
75 // waves are recalculated when the corresponding resonance mass or width
76 // changes. Note that in a multithread environment each thread needs its
77 // own cache, hence the '10'. Ten threads should be enough for anyone!
78 __device__ fpcomplex *cResSF_TD[10];
79 __device__ fpcomplex *Amps_TD[10];
80 /*
81 Constant memory array to hold specific info for amplitude calculation.
82 First entries are the starting points in array, necessary, because number of Lineshapes(LS) or Spinfactors(SF) can vary
83 |start of each Amplitude| #Linshapes | #Spinfactors | LS-indices | SF-indices|
84 | 1 entry per Amplitude | 1 per Amp | 1 per Amp | #LS in Amp| #SF in Amp|
85 */
86 // __constant__ unsigned int AmpIndices_TD[100];
87 
88 // This function gets called by the GooFit framework to get the value of the PDF.
89 __device__ fptype device_TDDP4(fptype *evt, fptype *p, unsigned int *indices) {
90  // printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real,
91  // totalAmp.imag);
92 
93  auto evtNum = static_cast<int>(floor(0.5 + evt[indices[7 + indices[0]]]));
94  // GOOFIT_TRACE("TDDP4: Number of events: {}", evtNum);
95 
96  unsigned int cacheToUse = indices[2];
97  unsigned int numAmps = indices[5];
98 
99  fpcomplex AmpA(0, 0);
100  fpcomplex AmpB(0, 0);
101  fpcomplex amp_A, amp_B;
102 
103  int k = 0;
104 
105  for(int i = 0; i < numAmps; ++i) {
106  unsigned int start = AmpIndices[i];
107  unsigned int flag = AmpIndices[start + 3 + numAmps];
108  fpcomplex temp;
109 
110  /*printf("flag:%i\n",flag);*/
111  switch(flag) {
112  case 0:
113  amp_A = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
114  temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
115  AmpA += temp * amp_A;
116  break;
117 
118  case 1:
119  amp_B = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
120  temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
121  AmpB += temp * amp_B;
122  break;
123 
124  case 2:
125  amp_A = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
126  temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
127  AmpA += temp * amp_A;
128 
129  ++k;
130  amp_B = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
131  temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
132  AmpB += temp * amp_B;
133  break;
134  }
135  }
136 
137  fptype _tau = p[indices[7]];
138  fptype _xmixing = p[indices[8]];
139  fptype _ymixing = p[indices[9]];
140  fptype _SqWStoRSrate = p[indices[10]];
141  fptype _time = evt[indices[8 + indices[0]]];
142  fptype _sigma = evt[indices[9 + indices[0]]];
143 
144  AmpA *= _SqWStoRSrate;
145  /*printf("%i read time: %.5g x: %.5g y: %.5g \n",evtNum, _time, _xmixing, _ymixing);*/
146 
147  fptype term1 = thrust::norm(AmpA) + thrust::norm(AmpB);
148  fptype term2 = thrust::norm(AmpA) - thrust::norm(AmpB);
149  fpcomplex term3 = AmpA * thrust::conj(AmpB);
150  // printf("%i dev %.7g %.7g %.7g %.7g\n", evtNum, norm2(AmpA), norm2(AmpB), term3.real, term3.imag);
151 
152  int effFunctionIdx = 12 + 2 * indices[3] + 2 * indices[4] + 2 * indices[6];
153  int resfctidx = indices[11];
154  int resfctpar = effFunctionIdx + 2;
155 
156  fptype ret = (*(reinterpret_cast<device_resfunction_ptr>(device_function_table[resfctidx])))(
157  term1, term2, term3.real(), term3.imag(), _tau, _time, _xmixing, _ymixing, _sigma, p, indices + resfctpar);
158  fptype eff = callFunction(evt, indices[effFunctionIdx], indices[effFunctionIdx + 1]);
159  /*printf("%i result %.7g, eff %.7g\n",evtNum, ret, eff);*/
160  ret *= eff;
161  /*printf("in prob: %f\n", ret);*/
162  return ret;
163 }
164 
165 __device__ device_function_ptr ptr_to_TDDP4 = device_TDDP4;
166 
167 __host__ TDDP4::TDDP4(std::string n,
168  std::vector<Observable> observables,
169  DecayInfo4t decay,
170  MixingTimeResolution *Tres,
171  GooPdf *efficiency,
172  Observable *mistag,
173  unsigned int MCeventsNorm)
174  : GooPdf(n)
175  , decayInfo(decay)
176  , resolution(Tres)
177  , totalEventSize(observables.size() + 2) // number of observables plus eventnumber
178 {
179  // should include m12, m34, cos12, cos34, phi, eventnumber, dtime, sigmat. In this order!
180  for(auto &observable : observables) {
181  registerObservable(observable);
182  }
183 
184  std::vector<fptype> decayConstants;
185  decayConstants.push_back(decayInfo.meson_radius);
186 
187  for(double &particle_masse : decayInfo.particle_masses) {
188  decayConstants.push_back(particle_masse);
189  }
190 
191  if(mistag) {
192  registerObservable(*mistag);
193  totalEventSize = 9;
194  decayConstants.push_back(1); // Flags existence of mistag
195  }
196 
197  std::vector<unsigned int> pindices;
198  pindices.push_back(registerConstants(decayConstants.size()));
199  MEMCPY_TO_SYMBOL(functorConstants,
200  &decayConstants[0],
201  decayConstants.size() * sizeof(fptype),
202  cIndex * sizeof(fptype),
203  cudaMemcpyHostToDevice);
204  static int cacheCount = 0;
205  cacheToUse = cacheCount++;
206  pindices.push_back(cacheToUse);
207  pindices.push_back(0); //#LS
208  pindices.push_back(0); //#SF
209  pindices.push_back(0); //#AMP
210  pindices.push_back(0); // number of coefficients, because its not necessary to be equal to number of Amps.
211  pindices.push_back(registerParameter(decayInfo._tau));
212  pindices.push_back(registerParameter(decayInfo._xmixing));
213  pindices.push_back(registerParameter(decayInfo._ymixing));
214  pindices.push_back(registerParameter(decayInfo._SqWStoRSrate));
215  if(resolution->getDeviceFunction() < 0)
216  throw GooFit::GeneralError("The resolution device function index {} must be more than 0",
217  resolution->getDeviceFunction());
218  pindices.push_back(static_cast<unsigned int>(resolution->getDeviceFunction()));
219 
220  // This is the start of reading in the amplitudes and adding the lineshapes and Spinfactors to this PDF
221  // This is done in this way so we don't have multiple copies of one lineshape in one pdf.
222  std::vector<unsigned int> ampidx;
223  std::vector<unsigned int> nPermVec;
224  std::vector<unsigned int> ampidxstart;
225  unsigned int coeff_counter = 0;
226  std::vector<Amplitude *> AmpBuffer;
227 
228  std::vector<Amplitude *> AmpsA = decayInfo.amplitudes;
229  std::vector<Amplitude *> AmpsB = decayInfo.amplitudes_B;
230 
231  for(auto &i : AmpsA) {
232  AmpMap[i->_uniqueDecayStr] = std::make_pair(std::vector<unsigned int>(0), std::vector<unsigned int>(0));
233 
234  // printf("Adding Amplitde A:%s\n",AmpsA[i]->_uniqueDecayStr.c_str());
235 
236  auto LSvec = i->_LS;
237 
238  for(auto &LSIT : LSvec) {
239  auto found = std::find_if(components.begin(), components.end(), [&LSIT](const PdfBase *L) {
240  return (*LSIT) == *(dynamic_cast<const Lineshape *>(L));
241  });
242 
243  if(found != components.end()) {
244  AmpMap[i->_uniqueDecayStr].first.push_back(std::distance(components.begin(), found));
245  // printf("LS %s found at %i\n",(*found)->getName().c_str(),std::distance(components.begin(), found));
246  } else {
247  components.push_back(LSIT);
248  AmpMap[i->_uniqueDecayStr].first.push_back(components.size() - 1);
249  // printf("Adding LS %s\n",(*LSIT)->getName().c_str());
250  }
251  }
252 
253  auto SFvec = i->_SF;
254 
255  for(auto &SFIT : SFvec) {
256  auto found = std::find_if(
257  SpinFactors.begin(), SpinFactors.end(), [&SFIT](const SpinFactor *S) { return (*SFIT) == (*S); });
258 
259  if(found != SpinFactors.end()) {
260  AmpMap[i->_uniqueDecayStr].second.push_back(std::distance(SpinFactors.begin(), found));
261  // printf("SF %s found at %i\n",(*found)->getName().c_str(), std::distance(SpinFactors.begin(), found));
262  } else {
263  SpinFactors.push_back(SFIT);
264  AmpMap[i->_uniqueDecayStr].second.push_back(SpinFactors.size() - 1);
265  // printf("Adding SF %s\n",(*SFIT)->getName().c_str());
266  }
267  }
268 
269  nPermVec.push_back(i->_nPerm);
270  pindices.push_back(registerParameter(i->_ar));
271  pindices.push_back(registerParameter(i->_ai));
272  ++coeff_counter;
273  AmpBuffer.push_back(i);
274  unsigned int flag = 0;
275  auto inB = std::find_if(AmpsB.begin(), AmpsB.end(), [AmpsA, &i](const Amplitude *A) { return *i == (*A); });
276 
277  if(inB != AmpsB.end()) {
278  // printf("Found in AmpsB as well: %s\n", (*inB)->_uniqueDecayStr.c_str());
279  flag = 2;
280  pindices.push_back(registerParameter((*inB)->_ar));
281  pindices.push_back(registerParameter((*inB)->_ai));
282  ++coeff_counter;
283  }
284 
285  ampidxstart.push_back(ampidx.size());
286  std::vector<unsigned int> ls = AmpMap[i->_uniqueDecayStr].first;
287  std::vector<unsigned int> sf = AmpMap[i->_uniqueDecayStr].second;
288  ampidx.push_back(ls.size());
289  ampidx.push_back(sf.size());
290  ampidx.push_back(i->_nPerm);
291  ampidx.push_back(flag);
292  ampidx.insert(ampidx.end(), ls.begin(), ls.end());
293  ampidx.insert(ampidx.end(), sf.begin(), sf.end());
294  }
295 
296  for(auto &i : AmpsB) {
297  unsigned int flag = 1;
298  auto inB
299  = std::find_if(AmpBuffer.begin(), AmpBuffer.end(), [AmpsB, &i](const Amplitude *A) { return *i == (*A); });
300 
301  if(inB != AmpBuffer.end())
302  continue;
303 
304  AmpMap[i->_uniqueDecayStr] = std::make_pair(std::vector<unsigned int>(0), std::vector<unsigned int>(0));
305  // fprintf("Adding Amplitude B %s\n",AmpsB[i]->_uniqueDecayStr.c_str());
306 
307  auto LSvec = i->_LS;
308 
309  for(auto &LSIT : LSvec) {
310  auto found = std::find_if(components.begin(), components.end(), [&LSIT](const PdfBase *L) {
311  return (*LSIT) == *(dynamic_cast<const Lineshape *>(L));
312  });
313 
314  if(found != components.end()) {
315  AmpMap[i->_uniqueDecayStr].first.push_back(std::distance(components.begin(), found));
316  // fprintf("LS %s found at %i\n",(*found)->getName().c_str(), std::distance(components.begin(), found));
317  } else {
318  components.push_back(LSIT);
319  AmpMap[i->_uniqueDecayStr].first.push_back(components.size() - 1);
320  // fprintf("Adding LS %s\n",(*LSIT)->getName().c_str());
321  }
322  }
323 
324  auto SFvec = i->_SF;
325 
326  for(auto &SFIT : SFvec) {
327  auto found = std::find_if(
328  SpinFactors.begin(), SpinFactors.end(), [&SFIT](const SpinFactor *S) { return (*SFIT) == (*S); });
329 
330  if(found != SpinFactors.end()) {
331  AmpMap[i->_uniqueDecayStr].second.push_back(std::distance(SpinFactors.begin(), found));
332  // fprintf("SF %s found at %i\n",(*found)->getName().c_str(), std::distance(SpinFactors.begin(),
333  // found));
334  } else {
335  SpinFactors.push_back(SFIT);
336  AmpMap[i->_uniqueDecayStr].second.push_back(SpinFactors.size() - 1);
337  // fprintf("Adding SF %s\n",(*SFIT)->getName().c_str());
338  }
339  }
340 
341  nPermVec.push_back(i->_nPerm);
342  pindices.push_back(registerParameter(i->_ar));
343  pindices.push_back(registerParameter(i->_ai));
344  ++coeff_counter;
345  ampidxstart.push_back(ampidx.size());
346  std::vector<unsigned int> ls = AmpMap[i->_uniqueDecayStr].first;
347  std::vector<unsigned int> sf = AmpMap[i->_uniqueDecayStr].second;
348  ampidx.push_back(ls.size());
349  ampidx.push_back(sf.size());
350  ampidx.push_back(i->_nPerm);
351  ampidx.push_back(flag);
352  ampidx.insert(ampidx.end(), ls.begin(), ls.end());
353  ampidx.insert(ampidx.end(), sf.begin(), sf.end());
354  }
355 
356  MEMCPY_TO_SYMBOL(
357  AmpIndices, &(ampidxstart[0]), ampidxstart.size() * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
358  MEMCPY_TO_SYMBOL(AmpIndices,
359  &(ampidx[0]),
360  ampidx.size() * sizeof(unsigned int),
361  ampidxstart.size() * sizeof(unsigned int),
362  cudaMemcpyHostToDevice);
363 
364  pindices[2] = components.size();
365  pindices[3] = SpinFactors.size();
366  pindices[4] = AmpMap.size();
367  pindices[5] = coeff_counter;
368 
369  for(auto &component : components) {
370  reinterpret_cast<Lineshape *>(component)->setConstantIndex(cIndex);
371  pindices.push_back(reinterpret_cast<Lineshape *>(component)->getFunctionIndex());
372  pindices.push_back(reinterpret_cast<Lineshape *>(component)->getParameterIndex());
373  }
374 
375  for(auto &SpinFactor : SpinFactors) {
376  pindices.push_back(SpinFactor->getFunctionIndex());
377  pindices.push_back(SpinFactor->getParameterIndex());
378  SpinFactor->setConstantIndex(cIndex);
379  }
380 
381  pindices.push_back(efficiency->getFunctionIndex());
382  pindices.push_back(efficiency->getParameterIndex());
383  components.push_back(efficiency);
384 
385  // In case the resolution function needs parameters, this registers them.
386  resolution->createParameters(pindices, this);
387  GET_FUNCTION_ADDR(ptr_to_TDDP4);
388  initialize(pindices);
389 
390  Integrator = new NormIntegrator_TD(parameters);
391  redoIntegral = new bool[components.size() - 1];
392  cachedMasses = new fptype[components.size() - 1];
393  cachedWidths = new fptype[components.size() - 1];
394 
395  for(int i = 0; i < components.size() - 1; ++i) {
396  redoIntegral[i] = true;
397  cachedMasses[i] = -1;
398  cachedWidths[i] = -1;
399  lscalculators.push_back(new LSCalculator_TD(parameters, i));
400  }
401 
402  for(int i = 0; i < SpinFactors.size(); ++i) {
403  sfcalculators.push_back(new SFCalculator_TD(parameters, i));
404  }
405 
406  for(int i = 0; i < AmpMap.size(); ++i) {
407  AmpCalcs.push_back(new AmpCalc_TD(ampidxstart[i], parameters, nPermVec[i]));
408  }
409 
410  // fprintf(stderr,"#Amp's %i, #LS %i, #SF %i \n", AmpMap.size(), components.size()-1, SpinFactors.size() );
411 
412  std::vector<mcbooster::GReal_t> masses(decayInfo.particle_masses.begin() + 1, decayInfo.particle_masses.end());
413  mcbooster::PhaseSpace phsp(decayInfo.particle_masses[0], masses, MCeventsNorm, generation_offset);
414  phsp.Generate(mcbooster::Vector4R(decayInfo.particle_masses[0], 0.0, 0.0, 0.0));
415  phsp.Unweight();
416 
417  auto nAcc = phsp.GetNAccepted();
418  mcbooster::BoolVector_d flags = phsp.GetAccRejFlags();
419  auto d1 = phsp.GetDaughters(0);
420  auto d2 = phsp.GetDaughters(1);
421  auto d3 = phsp.GetDaughters(2);
422  auto d4 = phsp.GetDaughters(3);
423 
424  auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(d1.begin(), d2.begin(), d3.begin(), d4.begin()));
425  auto zip_end = zip_begin + d1.size();
426  auto new_end = thrust::remove_if(zip_begin, zip_end, flags.begin(), thrust::logical_not<bool>());
427 
428  d1.erase(thrust::get<0>(new_end.get_iterator_tuple()), d1.end());
429  d2.erase(thrust::get<1>(new_end.get_iterator_tuple()), d2.end());
430  d3.erase(thrust::get<2>(new_end.get_iterator_tuple()), d3.end());
431  d4.erase(thrust::get<3>(new_end.get_iterator_tuple()), d4.end());
432 
433  mcbooster::ParticlesSet_d pset(4);
434  pset[0] = &d1;
435  pset[1] = &d2;
436  pset[2] = &d3;
437  pset[3] = &d4;
438 
439  norm_M12 = mcbooster::RealVector_d(nAcc);
440  norm_M34 = mcbooster::RealVector_d(nAcc);
441  norm_CosTheta12 = mcbooster::RealVector_d(nAcc);
442  norm_CosTheta34 = mcbooster::RealVector_d(nAcc);
443  norm_phi = mcbooster::RealVector_d(nAcc);
444 
445  mcbooster::VariableSet_d VarSet(5);
446  VarSet[0] = &norm_M12;
447  VarSet[1] = &norm_M34;
448  VarSet[2] = &norm_CosTheta12;
449  VarSet[3] = &norm_CosTheta34;
450  VarSet[4] = &norm_phi;
451 
452  Dim5 eval = Dim5();
453  mcbooster::EvaluateArray<Dim5>(eval, pset, VarSet);
454 
455  norm_SF = mcbooster::RealVector_d(nAcc * SpinFactors.size());
456  norm_LS = mcbooster::mc_device_vector<fpcomplex>(nAcc * (components.size() - 1));
457  MCevents = nAcc;
458 
459  addSpecialMask(PdfBase::ForceSeparateNorm);
460 }
461 
462 // makes the arrays to chache the lineshape values and spinfactors in CachedResSF and the values of the amplitudes in
463 // cachedAMPs
464 // I made the choice to have spinfactors necxt to the values of the lineshape in memory. I waste memory by doing this
465 // because a spinfactor is saved as complex
466 // It would be nice to test if this is better than having the spinfactors stored seperately.
467 __host__ void TDDP4::setDataSize(unsigned int dataSize, unsigned int evtSize) {
468  // Default 3 is m12, m13, evtNum for DP 2dim, 4-body decay has 5 independent vars plus evtNum = 6
469  totalEventSize = evtSize;
470  if(totalEventSize < 3)
471  throw GooFit::GeneralError("totalEventSize {} must be 3 or more", totalEventSize);
472 
473  if(cachedResSF)
474  delete cachedResSF;
475 
476  if(cachedAMPs)
477  delete cachedAMPs;
478 
479  numEntries = dataSize;
480  cachedResSF = new thrust::device_vector<fpcomplex>(
481  dataSize * (components.size() + SpinFactors.size() - 1)); // -1 because 1 component is efficiency
482  void *dummy = thrust::raw_pointer_cast(cachedResSF->data());
483  MEMCPY_TO_SYMBOL(cResSF_TD, &dummy, sizeof(fpcomplex *), cacheToUse * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
484 
485  cachedAMPs = new thrust::device_vector<fpcomplex>(dataSize * (AmpCalcs.size()));
486  void *dummy2 = thrust::raw_pointer_cast(cachedAMPs->data());
487  MEMCPY_TO_SYMBOL(Amps_TD, &dummy2, sizeof(fpcomplex *), cacheToUse * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
488 
489  setForceIntegrals();
490 }
491 
492 // this is where the actual magic happens. This function does all the calculations!
493 __host__ fptype TDDP4::normalize() const {
494  if(cachedResSF == nullptr)
495  throw GeneralError("You must call dp.setDataSize(currData.getNumEvents(), N) first!");
496 
497  // fprintf(stderr, "start normalize\n");
498  recursiveSetNormalisation(1); // Not going to normalize efficiency,
499  // so set normalisation factor to 1 so it doesn't get multiplied by zero.
500  // Copy at this time to ensure that the SpecialResonanceCalculators, which need the efficiency,
501  // don't get zeroes through multiplying by the normFactor.
502  MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
503 
504  // check if MINUIT changed any parameters and if so remember that so we know
505  // we need to recalculate that lineshape and every amp, that uses that lineshape
506  for(unsigned int i = 0; i < components.size() - 1; ++i) {
507  redoIntegral[i] = forceRedoIntegrals;
508 
509  if(!(components[i]->parametersChanged()))
510  continue;
511 
512  redoIntegral[i] = true;
513  }
514 
515  SpinsCalculated = !forceRedoIntegrals;
516  forceRedoIntegrals = false;
517 
518  // just some thrust iterators for the calculation.
519  thrust::constant_iterator<fptype *> dataArray(dev_event_array);
520  thrust::constant_iterator<int> eventSize(totalEventSize);
521  thrust::counting_iterator<int> eventIndex(0);
522 
523  // Calculate spinfactors only once for normalisation events and real events
524  // strided_range is a template implemented in DalitsPlotHelpers.hh
525  // it basically goes through the array by increasing the pointer by a certain amount instead of just one step.
526  if(!SpinsCalculated) {
527  for(int i = 0; i < SpinFactors.size(); ++i) {
528  unsigned int offset = components.size() - 1;
529  thrust::transform(
530  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
531  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, dataArray, eventSize)),
532  strided_range<thrust::device_vector<fpcomplex>::iterator>(
533  cachedResSF->begin() + offset + i, cachedResSF->end(), (components.size() + SpinFactors.size() - 1))
534  .begin(),
535  *(sfcalculators[i]));
536 
537  if(!generation_no_norm) {
538  thrust::transform(
539  thrust::make_zip_iterator(thrust::make_tuple(norm_M12.begin(),
540  norm_M34.begin(),
541  norm_CosTheta12.begin(),
542  norm_CosTheta34.begin(),
543  norm_phi.begin())),
544  thrust::make_zip_iterator(thrust::make_tuple(
545  norm_M12.end(), norm_M34.end(), norm_CosTheta12.end(), norm_CosTheta34.end(), norm_phi.end())),
546  (norm_SF.begin() + (i * MCevents)),
547  NormSpinCalculator_TD(parameters, i));
548  }
549  }
550 
551  SpinsCalculated = true;
552  }
553 
554  // fprintf(stderr, "normalize after spins\n");
555 
556  // this calculates the values of the lineshapes and stores them in the array. It is recalculated every time
557  // parameters change.
558  for(int i = 0; i < components.size() - 1; ++i) {
559  if(redoIntegral[i]) {
560  thrust::transform(
561  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
562  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, dataArray, eventSize)),
563  strided_range<thrust::device_vector<fpcomplex>::iterator>(
564  cachedResSF->begin() + i, cachedResSF->end(), (components.size() + SpinFactors.size() - 1))
565  .begin(),
566  *(lscalculators[i]));
567  }
568  }
569 
570  // fprintf(stderr, "normalize after LS\n");
571 
572  // this is a little messy but it basically checks if the amplitude includes one of the recalculated lineshapes and
573  // if so recalculates that amplitude
574  auto AmpMapIt = AmpMap.begin();
575 
576  for(int i = 0; i < AmpCalcs.size(); ++i) {
577  std::vector<unsigned int> redoidx((*AmpMapIt).second.first);
578  bool redo = false;
579 
580  for(unsigned int j : redoidx) {
581  if(!redoIntegral[j])
582  continue;
583 
584  redo = true;
585  break;
586  }
587 
588  if(redo) {
589  thrust::transform(eventIndex,
590  eventIndex + numEntries,
591  strided_range<thrust::device_vector<fpcomplex>::iterator>(
592  cachedAMPs->begin() + i, cachedAMPs->end(), AmpCalcs.size())
593  .begin(),
594  *(AmpCalcs[i]));
595  }
596  }
597 
598  // fprintf(stderr, "normalize after Amps\n");
599 
600  // lineshape value calculation for the normalisation, also recalculated every time parameter change
601  if(!generation_no_norm) {
602  for(int i = 0; i < components.size() - 1; ++i) {
603  if(!redoIntegral[i])
604  continue;
605 
606  thrust::transform(
607  thrust::make_zip_iterator(thrust::make_tuple(norm_M12.begin(),
608  norm_M34.begin(),
609  norm_CosTheta12.begin(),
610  norm_CosTheta34.begin(),
611  norm_phi.begin())),
612  thrust::make_zip_iterator(thrust::make_tuple(
613  norm_M12.end(), norm_M34.end(), norm_CosTheta12.end(), norm_CosTheta34.end(), norm_phi.end())),
614  (norm_LS.begin() + (i * MCevents)),
615  NormLSCalculator_TD(parameters, i));
616  }
617  }
618 
619  thrust::constant_iterator<fptype *> normSFaddress(thrust::raw_pointer_cast(norm_SF.data()));
620  thrust::constant_iterator<fpcomplex *> normLSaddress(thrust::raw_pointer_cast(norm_LS.data()));
621  thrust::constant_iterator<int> NumNormEvents(MCevents);
622 
623  // this does the rest of the integration with the cached lineshape and spinfactor values for the normalization
624  // events
625  auto ret = 1.0;
626 
627  if(!generation_no_norm) {
628  thrust::tuple<fptype, fptype, fptype, fptype> dummy(0, 0, 0, 0);
629  FourDblTupleAdd MyFourDoubleTupleAdditionFunctor;
630  thrust::tuple<fptype, fptype, fptype, fptype> sumIntegral;
631  sumIntegral = thrust::transform_reduce(
632  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, NumNormEvents, normSFaddress, normLSaddress)),
633  thrust::make_zip_iterator(
634  thrust::make_tuple(eventIndex + MCevents, NumNormEvents, normSFaddress, normLSaddress)),
635  *Integrator,
636  dummy,
637  MyFourDoubleTupleAdditionFunctor);
638 
639  // printf("normalize A2/#evts , B2/#evts: %.5g, %.5g\n",thrust::get<0>(sumIntegral)/MCevents,
640  // thrust::get<1>(sumIntegral)/MCevents);
641  fptype tau = host_params[host_indices[parameters + 7]];
642  fptype xmixing = host_params[host_indices[parameters + 8]];
643  fptype ymixing = host_params[host_indices[parameters + 9]];
644 
645  ret = resolution->normalisation(thrust::get<0>(sumIntegral),
646  thrust::get<1>(sumIntegral),
647  thrust::get<2>(sumIntegral),
648  thrust::get<3>(sumIntegral),
649  tau,
650  xmixing,
651  ymixing);
652 
653  // MCevents is the number of normalisation events.
654  ret /= MCevents;
655  }
656 
657  host_normalisation[parameters] = 1.0 / ret;
658  // printf("end of normalize %f\n", ret);
659  return ret;
660 }
661 
662 __host__
663  std::tuple<mcbooster::ParticlesSet_h, mcbooster::VariableSet_h, mcbooster::RealVector_h, mcbooster::BoolVector_h>
664  TDDP4::GenerateSig(unsigned int numEvents) {
665  copyParams();
666 
667  std::vector<mcbooster::GReal_t> masses(decayInfo.particle_masses.begin() + 1, decayInfo.particle_masses.end());
668  mcbooster::PhaseSpace phsp(decayInfo.particle_masses[0], masses, numEvents, generation_offset);
669  phsp.Generate(mcbooster::Vector4R(decayInfo.particle_masses[0], 0.0, 0.0, 0.0));
670 
671  phsp.Unweight();
672 
673  auto nAcc = phsp.GetNAccepted();
674  mcbooster::BoolVector_d flags = phsp.GetAccRejFlags();
675  auto d1 = phsp.GetDaughters(0);
676  auto d2 = phsp.GetDaughters(1);
677  auto d3 = phsp.GetDaughters(2);
678  auto d4 = phsp.GetDaughters(3);
679 
680  auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(d1.begin(), d2.begin(), d3.begin(), d4.begin()));
681  auto zip_end = zip_begin + d1.size();
682  auto new_end = thrust::remove_if(zip_begin, zip_end, flags.begin(), thrust::logical_not<bool>());
683 
684  flags = mcbooster::BoolVector_d();
685 
686  d1.erase(thrust::get<0>(new_end.get_iterator_tuple()), d1.end());
687  d2.erase(thrust::get<1>(new_end.get_iterator_tuple()), d2.end());
688  d3.erase(thrust::get<2>(new_end.get_iterator_tuple()), d3.end());
689  d4.erase(thrust::get<3>(new_end.get_iterator_tuple()), d4.end());
690 
691  mcbooster::ParticlesSet_d pset(4);
692  pset[0] = &d1;
693  pset[1] = &d2;
694  pset[2] = &d3;
695  pset[3] = &d4;
696 
697  auto SigGen_M12_d = mcbooster::RealVector_d(nAcc);
698  auto SigGen_M34_d = mcbooster::RealVector_d(nAcc);
699  auto SigGen_CosTheta12_d = mcbooster::RealVector_d(nAcc);
700  auto SigGen_CosTheta34_d = mcbooster::RealVector_d(nAcc);
701  auto SigGen_phi_d = mcbooster::RealVector_d(nAcc);
702  auto dtime_d = mcbooster::RealVector_d(nAcc);
703 
704  thrust::counting_iterator<unsigned int> index_sequence_begin(0);
705 
706  fptype tau = host_params[host_indices[parameters + 7]];
707  fptype ymixing = host_params[host_indices[parameters + 9]];
708  fptype gammamin = 1.0 / tau - fabs(ymixing) / tau;
709  /*printf("hostparams: %f, %f", tau, ymixing);*/
710 
711  thrust::transform(
712  index_sequence_begin, index_sequence_begin + nAcc, dtime_d.begin(), genExp(generation_offset, gammamin));
713 
714  mcbooster::VariableSet_d VarSet_d(5);
715  VarSet_d[0] = &SigGen_M12_d;
716  VarSet_d[1] = &SigGen_M34_d;
717  VarSet_d[2] = &SigGen_CosTheta12_d;
718  VarSet_d[3] = &SigGen_CosTheta34_d;
719  VarSet_d[4] = &SigGen_phi_d;
720 
721  Dim5 eval = Dim5();
722  mcbooster::EvaluateArray<Dim5>(eval, pset, VarSet_d);
723 
724  auto h1 = new mcbooster::Particles_h(d1);
725  auto h2 = new mcbooster::Particles_h(d2);
726  auto h3 = new mcbooster::Particles_h(d3);
727  auto h4 = new mcbooster::Particles_h(d4);
728 
729  mcbooster::ParticlesSet_h ParSet(4);
730  ParSet[0] = h1;
731  ParSet[1] = h2;
732  ParSet[2] = h3;
733  ParSet[3] = h4;
734 
735  auto SigGen_M12_h = new mcbooster::RealVector_h(SigGen_M12_d);
736  auto SigGen_M34_h = new mcbooster::RealVector_h(SigGen_M34_d);
737  auto SigGen_CosTheta12_h = new mcbooster::RealVector_h(SigGen_CosTheta12_d);
738  auto SigGen_CosTheta34_h = new mcbooster::RealVector_h(SigGen_CosTheta34_d);
739  auto SigGen_phi_h = new mcbooster::RealVector_h(SigGen_phi_d);
740  auto dtime_h = new mcbooster::RealVector_h(dtime_d);
741 
742  mcbooster::VariableSet_h VarSet(6);
743  VarSet[0] = SigGen_M12_h;
744  VarSet[1] = SigGen_M34_h;
745  VarSet[2] = SigGen_CosTheta12_h;
746  VarSet[3] = SigGen_CosTheta34_h;
747  VarSet[4] = SigGen_phi_h;
748  VarSet[5] = dtime_h;
749 
750  phsp.~PhaseSpace();
751 
752  auto DS = new mcbooster::RealVector_d(8 * nAcc);
753  thrust::counting_iterator<int> eventNumber(0);
754 
755 #pragma unroll
756 
757  for(int i = 0; i < 5; ++i) {
758  mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr(DS->begin() + i, DS->end(), 8);
759  thrust::copy(VarSet_d[i]->begin(), VarSet_d[i]->end(), sr.begin());
760  }
761 
762  mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr(DS->begin() + 5, DS->end(), 8);
763  thrust::copy(eventNumber, eventNumber + nAcc, sr.begin());
764 
765  mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr2(DS->begin() + 6, DS->end(), 8);
766  thrust::fill_n(sr2.begin(), nAcc, 0);
767 
768  dev_event_array = thrust::raw_pointer_cast(DS->data());
769  setDataSize(nAcc, 8);
770 
771  generation_no_norm = true; // we need no normalization for generation, but we do need to make sure that norm = 1;
772  SigGenSetIndices();
773  normalize();
774  setForceIntegrals();
775  MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
776 
777  thrust::device_vector<fptype> weights(nAcc);
778  thrust::constant_iterator<int> eventSize(8);
779  thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
780  thrust::counting_iterator<int> eventIndex(0);
781 
782  MetricTaker evalor(this, getMetricPointer("ptr_to_Prob"));
783  thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
784  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, arrayAddress, eventSize)),
785  weights.begin(),
786  evalor);
787  cudaDeviceSynchronize();
788 
789  fptype wmax = 1.1 * (fptype)*thrust::max_element(weights.begin(), weights.end());
790 
791  if(wmax > maxWeight && maxWeight != 0)
792  fprintf(stderr,
793  "WARNING: you just encountered a higher maximum weight than observed in previous iterations.\n"
794  "WARNING: Consider recalculating your AccRej flags and acceping based upon these.\n"
795  "WARNING: previous weight: %.4g, new weight: %.4g\n",
796  maxWeight,
797  wmax);
798 
799  maxWeight = wmax > maxWeight ? wmax : maxWeight;
800 
801  thrust::copy(dtime_d.begin(), dtime_d.end(), sr2.begin());
802 
803  dtime_d = mcbooster::RealVector_d();
804  thrust::device_vector<fptype> results(nAcc);
805 
806  thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
807  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, arrayAddress, eventSize)),
808  results.begin(),
809  evalor);
810 
811  cudaDeviceSynchronize();
812 
813  thrust::device_vector<fptype> flag2(nAcc);
814  thrust::counting_iterator<mcbooster::GLong_t> first(0);
815  // thrust::counting_iterator<mcbooster::GLong_t> last = first + nAcc;
816 
817  // we do not want to copy the whole class to the GPU so capturing *this is not a great option
818  // therefore perpare local copies to capture the variables we need
819  unsigned int tmpoff = generation_offset;
820  unsigned int tmpparam = parameters;
821  wmax = maxWeight;
822 
823  thrust::transform(
824  thrust::make_zip_iterator(thrust::make_tuple(eventIndex, results.begin(), arrayAddress, eventSize)),
825  thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, results.end(), arrayAddress, eventSize)),
826  flag2.begin(),
827  exp_functor(tmpparam, tmpoff, gammamin, wmax));
828 
829  gooFree(dev_event_array);
830 
831  /*printf("Offset: %i und wmax:%.5g\n",generation_offset, wmax );*/
832 
833  auto weights_h = mcbooster::RealVector_h(weights);
834  auto results_h = mcbooster::RealVector_h(results);
835  auto flags_h = mcbooster::BoolVector_h(flag2);
836  cudaDeviceSynchronize();
837 
838  return std::make_tuple(ParSet, VarSet, weights_h, flags_h);
839 }
840 
841 SFCalculator_TD::SFCalculator_TD(int pIdx, unsigned int sf_idx)
842  : _spinfactor_i(sf_idx)
843  , _parameters(pIdx) {}
844 
845 __device__ fpcomplex SFCalculator_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
846  int evtNum = thrust::get<0>(t);
847  fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
848 
849  unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
850  int parameter_i = 12 + (2 * indices[6]) + (indices[3] * 2)
851  + (_spinfactor_i * 2); // Find position of this resonance relative to DALITZPLOT start
852  unsigned int functn_i = indices[parameter_i];
853  unsigned int params_i = indices[parameter_i + 1];
854 
855  fptype m12 = evt[indices[2 + indices[0]]];
856  fptype m34 = evt[indices[3 + indices[0]]];
857  fptype cos12 = evt[indices[4 + indices[0]]];
858  fptype cos34 = evt[indices[5 + indices[0]]];
859  fptype phi = evt[indices[6 + indices[0]]];
860 
861  fptype vecs[16];
862  get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
863  // printf("%i, %i, %f, %f, %f, %f, %f \n",evtNum, thrust::get<2>(t), m12, m34, cos12, cos34, phi );
864  // printf("vec%i %f, %f, %f, %f\n",0, vecs[0], vecs[1], vecs[2], vecs[3]);
865  // printf("vec%i %f, %f, %f, %f\n",1, vecs[4], vecs[5], vecs[6], vecs[7]);
866  // printf("vec%i %f, %f, %f, %f\n",2, vecs[8], vecs[9], vecs[10], vecs[11]);
867  // printf("vec%i %f, %f, %f, %f\n",3, vecs[12], vecs[13], vecs[14], vecs[15]);
868 
869  auto func = reinterpret_cast<spin_function_ptr>(device_function_table[functn_i]);
870  fptype sf = (*func)(vecs, paramIndices + params_i);
871  // printf("SpinFactors %i : %.7g\n",_spinfactor_i, sf );
872  return {sf, 0.};
873 }
874 
875 NormSpinCalculator_TD::NormSpinCalculator_TD(int pIdx, unsigned int sf_idx)
876  : _spinfactor_i(sf_idx)
877  , _parameters(pIdx) {}
878 
879 __device__ fptype NormSpinCalculator_TD::operator()(
880  thrust::tuple<mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t> t)
881  const {
882  unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
883  int parameter_i = 12 + (2 * indices[6]) + (indices[3] * 2)
884  + (_spinfactor_i * 2); // Find position of this resonance relative to DALITZPLOT start
885  unsigned int functn_i = indices[parameter_i];
886  unsigned int params_i = indices[parameter_i + 1];
887 
888  fptype m12 = (thrust::get<0>(t));
889  fptype m34 = (thrust::get<1>(t));
890  fptype cos12 = (thrust::get<2>(t));
891  fptype cos34 = (thrust::get<3>(t));
892  fptype phi = (thrust::get<4>(t));
893 
894  fptype vecs[16];
895  get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
896 
897  // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,0, vecs[0], vecs[1], vecs[2], vecs[3]);
898  // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,1, vecs[4], vecs[5], vecs[6], vecs[7]);
899  // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,2, vecs[8], vecs[9], vecs[10], vecs[11]);
900  // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,3, vecs[12], vecs[13], vecs[14], vecs[15]);
901  // // }
902  auto func = reinterpret_cast<spin_function_ptr>(device_function_table[functn_i]);
903  fptype sf = (*func)(vecs, paramIndices + params_i);
904 
905  // printf("NormSF evt:%.5g, %.5g, %.5g, %.5g, %.5g\n", m12, m34, cos12, cos34, phi);
906  // printf("NormSF %i, %.7g\n",_spinfactor_i, sf );
907  // THREAD_SYNCH
908  return sf;
909 }
910 
911 LSCalculator_TD::LSCalculator_TD(int pIdx, unsigned int res_idx)
912  : _resonance_i(res_idx)
913  , _parameters(pIdx) {}
914 
915 __device__ fpcomplex LSCalculator_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
916  // Calculates the BW values for a specific resonance.
917  fpcomplex ret;
918 
919  int evtNum = thrust::get<0>(t);
920  fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
921 
922  unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
923  int parameter_i
924  = 12 + (2 * indices[6]) + (_resonance_i * 2); // Find position of this resonance relative to DALITZPLOT start
925  unsigned int functn_i = indices[parameter_i];
926  unsigned int params_i = indices[parameter_i + 1];
927  unsigned int pair = (paramIndices + params_i)[5];
928 
929  fptype m1 = functorConstants[indices[1] + 2];
930  fptype m2 = functorConstants[indices[1] + 3];
931  fptype m3 = functorConstants[indices[1] + 4];
932  fptype m4 = functorConstants[indices[1] + 5];
933 
934  fptype m12 = evt[indices[2 + indices[0]]];
935  fptype m34 = evt[indices[3 + indices[0]]];
936  fptype cos12 = evt[indices[4 + indices[0]]];
937  fptype cos34 = evt[indices[5 + indices[0]]];
938  fptype phi = evt[indices[6 + indices[0]]];
939 
940  if(pair < 2) {
941  fptype mres = pair == 0 ? m12 : m34;
942  fptype d1 = pair == 0 ? m1 : m3;
943  fptype d2 = pair == 0 ? m2 : m4;
944  ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
945  // printf("LS_nt %i: mass:%f, %f i%f\n",_resonance_i, mres, ret.real, ret.imag );
946  } else {
947  fptype vecs[16];
948  get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
949  fptype d1, d2;
950  fptype mres = getmass(pair, d1, d2, vecs, m1, m2, m3, m4);
951  ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
952  // printf("LS %i: mass:%f, %f i%f\n",_resonance_i, mres, ret.real, ret.imag );
953  }
954 
955  // printf("LS %i: %.7g, %.7g, %.7g, %.7g, %.7g \n",evtNum, m12, m34, cos12, cos34, phi );
956 
957  // if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return ret;
958  // printf("m12 %f \n", m12); // %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
959  // printf("#Parameters %i, #LS %i, #SF %i, #AMP %i \n", indices[0], indices[3], indices[4], indices[5]);
960  // printf("BW_%i : %f %f\n", _resonance_i, ret.real, ret.imag);
961 
962  return ret;
963 }
964 
965 NormLSCalculator_TD::NormLSCalculator_TD(int pIdx, unsigned int res_idx)
966  : _resonance_i(res_idx)
967  , _parameters(pIdx) {}
968 
969 __device__ fpcomplex NormLSCalculator_TD::operator()(
970  thrust::tuple<mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t> t)
971  const {
972  // Calculates the BW values for a specific resonance.
973  fpcomplex ret;
974 
975  unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
976  int parameter_i
977  = 12 + (2 * indices[6]) + (_resonance_i * 2); // Find position of this resonance relative to DALITZPLOT start
978  unsigned int functn_i = indices[parameter_i];
979  unsigned int params_i = indices[parameter_i + 1];
980  unsigned int pair = (paramIndices + params_i)[5];
981 
982  fptype m1 = functorConstants[indices[1] + 2];
983  fptype m2 = functorConstants[indices[1] + 3];
984  fptype m3 = functorConstants[indices[1] + 4];
985  fptype m4 = functorConstants[indices[1] + 5];
986 
987  fptype m12 = (thrust::get<0>(t));
988  fptype m34 = (thrust::get<1>(t));
989  fptype cos12 = (thrust::get<2>(t));
990  fptype cos34 = (thrust::get<3>(t));
991  fptype phi = (thrust::get<4>(t));
992 
993  if(pair < 2) {
994  fptype mres = pair == 0 ? m12 : m34;
995  fptype d1 = pair == 0 ? m1 : m3;
996  fptype d2 = pair == 0 ? m2 : m4;
997  ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
998  } else {
999  fptype vecs[16];
1000  get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
1001  fptype d1, d2;
1002  fptype mres = getmass(pair, d1, d2, vecs, m1, m2, m3, m4);
1003  ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
1004  }
1005 
1006  // printf("NormLS %f, %f, %f, %f, %f \n",m12, m34, cos12, cos34, phi );
1007  // printf("%i, %i, %i, %i, %i \n",numLS, numSF, numAmps, offset, evtNum );
1008  // printf("NLS %i, %f, %f\n",_resonance_i,ret.real, ret.imag);
1009 
1010  // printf("m12 %f \n", m12); // %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
1011  // printf("#Parameters %i, #LS %i, #SF %i, #AMP %i \n", indices[0], indices[3], indices[4], indices[5]);
1012  // THREAD_SYNCH
1013  return ret;
1014 }
1015 
1016 AmpCalc_TD::AmpCalc_TD(unsigned int AmpIdx, unsigned int pIdx, unsigned int nPerm)
1017  : _nPerm(nPerm)
1018  , _AmpIdx(AmpIdx)
1019  , _parameters(pIdx) {}
1020 
1021 __device__ fpcomplex AmpCalc_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
1022  unsigned int *indices = paramIndices + _parameters;
1023  unsigned int cacheToUse = indices[2];
1024  unsigned int totalLS = indices[3];
1025  unsigned int totalSF = indices[4];
1026  unsigned int totalAMP = indices[5];
1027  unsigned int offset = totalLS + totalSF;
1028  unsigned int numLS = AmpIndices[totalAMP + _AmpIdx];
1029  unsigned int numSF = AmpIndices[totalAMP + _AmpIdx + 1];
1030  unsigned int evtNum = thrust::get<0>(t);
1031 
1032  fpcomplex returnVal(0, 0);
1033  unsigned int SF_step = numSF / _nPerm;
1034  unsigned int LS_step = numLS / _nPerm;
1035 
1036  for(int i = 0; i < _nPerm; ++i) {
1037  fpcomplex ret(1, 0);
1038 
1039  for(int j = i * LS_step; j < (i + 1) * LS_step; ++j) {
1040  ret *= (cResSF_TD[cacheToUse][evtNum * offset + AmpIndices[totalAMP + _AmpIdx + 4 + j]]);
1041  // printf("Lineshape %i = (%.7g, %.7g)\n", j, (cResSF_TD[cacheToUse][evtNum*offset + AmpIndices[totalAMP +
1042  // _AmpIdx + 4 + j]]).real, (cResSF_TD[cacheToUse][evtNum*offset + AmpIndices[totalAMP + _AmpIdx + 4 +
1043  // j]]).imag);
1044  }
1045 
1046  // printf("Lineshape Product = (%.7g, %.7g)\n", ret.real, ret.imag);
1047  for(int j = i * SF_step; j < (i + 1) * SF_step; ++j) {
1048  ret *= (cResSF_TD[cacheToUse][evtNum * offset + totalLS + AmpIndices[totalAMP + _AmpIdx + 4 + numLS + j]]
1049  .real());
1050  // printf(" SF = %.7g\n", (cResSF_TD[cacheToUse][evtNum*offset + totalLS + AmpIndices[totalAMP + _AmpIdx + 4
1051  // + numLS + j]].real));
1052  }
1053 
1054  // printf("Lineshape Product * SF = (%.7g, %.7g)\n", ret.real, ret.imag);
1055 
1056  returnVal += ret;
1057  }
1058 
1059  returnVal *= (1 / sqrt(static_cast<fptype>(_nPerm)));
1060  // printf("Amplitude Value = (%.7g, %.7g)\n", returnVal.real, returnVal.imag);
1061  return returnVal;
1062 }
1063 
1064 NormIntegrator_TD::NormIntegrator_TD(unsigned int pIdx)
1065  : _parameters(pIdx) {}
1066 
1067 __device__ thrust::tuple<fptype, fptype, fptype, fptype> NormIntegrator_TD::
1068 operator()(thrust::tuple<int, int, fptype *, fpcomplex *> t) const {
1069  unsigned int *indices = paramIndices + _parameters;
1070  unsigned int totalAMP = indices[5];
1071 
1072  unsigned int evtNum = thrust::get<0>(t);
1073  unsigned int MCevents = thrust::get<1>(t);
1074  fptype *SFnorm = thrust::get<2>(t) + evtNum;
1075  fpcomplex *LSnorm = thrust::get<3>(t) + evtNum;
1076 
1077  fpcomplex AmpA(0, 0);
1078  fpcomplex AmpB(0, 0);
1079  fpcomplex amp_A, amp_B;
1080 
1081  int k = 0;
1082 
1083  for(int amp = 0; amp < totalAMP; ++amp) {
1084  unsigned int ampidx = AmpIndices[amp];
1085  unsigned int numLS = AmpIndices[totalAMP + ampidx];
1086  unsigned int numSF = AmpIndices[totalAMP + ampidx + 1];
1087  unsigned int nPerm = AmpIndices[totalAMP + ampidx + 2];
1088  unsigned int flag = AmpIndices[totalAMP + ampidx + 3];
1089  unsigned int SF_step = numSF / nPerm;
1090  unsigned int LS_step = numLS / nPerm;
1091  fpcomplex ret2(0, 0);
1092  // printf("%i, %i, %i, %i, %i, %i, %i, %i, %i, %f\n",ampidx, amp, numLS, numSF, nPerm,AmpIndices[totalAMP +
1093  // ampidx + 4 + 0], AmpIndices[totalAMP + ampidx + 4 + 1], AmpIndices[totalAMP + ampidx + 4 + 2],
1094  // AmpIndices[totalAMP + ampidx + 4 + 3], (1/sqrt((fptype)(nPerm))) );
1095 
1096  for(int j = 0; j < nPerm; ++j) {
1097  fpcomplex ret(1, 0);
1098 
1099  for(int i = j * LS_step; i < (j + 1) * LS_step; ++i) {
1100  fpcomplex matrixelement(LSnorm[AmpIndices[totalAMP + ampidx + 4 + i] * MCevents]);
1101  // printf("Norm BW %i, %.5g, %.5g\n",AmpIndices[totalAMP + ampidx + 4 + i] , matrixelement.real,
1102  // matrixelement.imag);
1103  ret *= matrixelement;
1104  }
1105 
1106  for(int i = j * SF_step; i < (j + 1) * SF_step; ++i) {
1107  fptype matrixelement = (SFnorm[AmpIndices[totalAMP + ampidx + 4 + numLS + i] * MCevents]);
1108  // printf("Norm SF %i, %.5g\n",AmpIndices[totalAMP + ampidx + 4 + i] , matrixelement);
1109  ret *= matrixelement;
1110  }
1111 
1112  ret2 += ret;
1113  }
1114 
1115  ret2 *= (1 / sqrt(static_cast<fptype>(nPerm)));
1116  // printf("Result Amplitude %i, %i, %.5g, %.5g\n",flag, amp, ret2.real, ret2.imag);
1117 
1118  switch(flag) {
1119  case 0:
1120  amp_A = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1121  AmpA += ret2 * amp_A;
1122  break;
1123 
1124  case 1:
1125  amp_B = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1126  AmpB += ret2 * amp_B;
1127  break;
1128 
1129  case 2:
1130  amp_A = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1131  AmpA += ret2 * amp_A;
1132  ++k;
1133  amp_B = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1134  AmpB += ret2 * amp_B;
1135  break;
1136  }
1137  }
1138 
1139  fptype _SqWStoRSrate = cudaArray[indices[10]];
1140  AmpA *= _SqWStoRSrate;
1141 
1142  auto AmpAB = AmpA * conj(AmpB);
1143  return {thrust::norm(AmpA), thrust::norm(AmpB), AmpAB.real(), AmpAB.imag()};
1144 }
1145 
1146 } // namespace GooFit