2 04/05/2016 Christoph Hasse
5 This code is not sufficently tested yet and still under heavy development!
8 - Test lineshapes, only done for BW_DP and BW_MINT so far
9 - Check and implement more SF
10 - Currently no check if the event is even allowed in phasespace is done. This should preferably be done outside of this
13 - Some things could be implemented differently maybe, performance should be compared for both cases.
14 -For example the way Spinfactors are stored in the same array as the Lineshape values.
15 Is this really worth the memory we lose by using a complex to store the SF?
17 #include <goofit/Error.h>
18 #include <goofit/Log.h>
19 #include <goofit/PDFs/physics/DP4Pdf.h>
20 #include <goofit/PDFs/physics/EvalVar.h>
21 #include <goofit/PDFs/physics/Tddp4Pdf.h>
22 #include <goofit/detail/Complex.h>
23 #include <mcbooster/Evaluate.h>
24 #include <mcbooster/EvaluateArray.h>
25 #include <mcbooster/GContainers.h>
26 #include <mcbooster/GFunctional.h>
27 #include <mcbooster/GTypes.h>
28 #include <mcbooster/Generate.h>
29 #include <mcbooster/Vector4R.h>
37 __host__ __device__ genExp(unsigned int c, fptype d)
41 __host__ __device__ fptype operator()(unsigned int x) const {
42 thrust::random::default_random_engine rand(1431655765);
43 thrust::uniform_real_distribution<fptype> dist(0, 1);
45 rand.discard(x + offset);
47 return -log(dist(rand)) / gamma;
52 size_t tmpparam, tmpoff;
53 fptype gammamin, wmax;
54 exp_functor(size_t tmpparam, size_t tmpoff, fptype gammamin, fptype wmax)
60 __device__ fptype operator()(thrust::tuple<unsigned int, fptype, fptype *, unsigned int> t) {
61 int evtNum = thrust::get<0>(t);
62 fptype *evt = thrust::get<2>(t) + (evtNum * thrust::get<3>(t));
63 unsigned int *indices = paramIndices + tmpparam;
64 fptype time = evt[indices[8 + indices[0]]];
66 thrust::random::minstd_rand0 rand(1431655765);
67 thrust::uniform_real_distribution<fptype> dist(0, 1);
68 rand.discard(tmpoff + evtNum);
70 return (dist(rand) * exp(-time * gammamin) * wmax) < thrust::get<1>(t);
74 // The function of this array is to hold all the cached waves; specific
75 // waves are recalculated when the corresponding resonance mass or width
76 // changes. Note that in a multithread environment each thread needs its
77 // own cache, hence the '10'. Ten threads should be enough for anyone!
78 __device__ fpcomplex *cResSF_TD[10];
79 __device__ fpcomplex *Amps_TD[10];
81 Constant memory array to hold specific info for amplitude calculation.
82 First entries are the starting points in array, necessary, because number of Lineshapes(LS) or Spinfactors(SF) can vary
83 |start of each Amplitude| #Linshapes | #Spinfactors | LS-indices | SF-indices|
84 | 1 entry per Amplitude | 1 per Amp | 1 per Amp | #LS in Amp| #SF in Amp|
86 // __constant__ unsigned int AmpIndices_TD[100];
88 // This function gets called by the GooFit framework to get the value of the PDF.
89 __device__ fptype device_TDDP4(fptype *evt, fptype *p, unsigned int *indices) {
90 // printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real,
93 auto evtNum = static_cast<int>(floor(0.5 + evt[indices[7 + indices[0]]]));
94 // GOOFIT_TRACE("TDDP4: Number of events: {}", evtNum);
96 unsigned int cacheToUse = indices[2];
97 unsigned int numAmps = indices[5];
100 fpcomplex AmpB(0, 0);
101 fpcomplex amp_A, amp_B;
105 for(int i = 0; i < numAmps; ++i) {
106 unsigned int start = AmpIndices[i];
107 unsigned int flag = AmpIndices[start + 3 + numAmps];
110 /*printf("flag:%i\n",flag);*/
113 amp_A = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
114 temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
115 AmpA += temp * amp_A;
119 amp_B = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
120 temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
121 AmpB += temp * amp_B;
125 amp_A = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
126 temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
127 AmpA += temp * amp_A;
130 amp_B = fpcomplex(p[indices[12 + 2 * (i + k)]], p[indices[13 + 2 * (i + k)]]);
131 temp = Amps_TD[cacheToUse][evtNum * numAmps + i];
132 AmpB += temp * amp_B;
137 fptype _tau = p[indices[7]];
138 fptype _xmixing = p[indices[8]];
139 fptype _ymixing = p[indices[9]];
140 fptype _SqWStoRSrate = p[indices[10]];
141 fptype _time = evt[indices[8 + indices[0]]];
142 fptype _sigma = evt[indices[9 + indices[0]]];
144 AmpA *= _SqWStoRSrate;
145 /*printf("%i read time: %.5g x: %.5g y: %.5g \n",evtNum, _time, _xmixing, _ymixing);*/
147 fptype term1 = thrust::norm(AmpA) + thrust::norm(AmpB);
148 fptype term2 = thrust::norm(AmpA) - thrust::norm(AmpB);
149 fpcomplex term3 = AmpA * thrust::conj(AmpB);
150 // printf("%i dev %.7g %.7g %.7g %.7g\n", evtNum, norm2(AmpA), norm2(AmpB), term3.real, term3.imag);
152 int effFunctionIdx = 12 + 2 * indices[3] + 2 * indices[4] + 2 * indices[6];
153 int resfctidx = indices[11];
154 int resfctpar = effFunctionIdx + 2;
156 fptype ret = (*(reinterpret_cast<device_resfunction_ptr>(device_function_table[resfctidx])))(
157 term1, term2, term3.real(), term3.imag(), _tau, _time, _xmixing, _ymixing, _sigma, p, indices + resfctpar);
158 fptype eff = callFunction(evt, indices[effFunctionIdx], indices[effFunctionIdx + 1]);
159 /*printf("%i result %.7g, eff %.7g\n",evtNum, ret, eff);*/
161 /*printf("in prob: %f\n", ret);*/
165 __device__ device_function_ptr ptr_to_TDDP4 = device_TDDP4;
167 __host__ TDDP4::TDDP4(std::string n,
168 std::vector<Observable> observables,
170 MixingTimeResolution *Tres,
173 unsigned int MCeventsNorm)
177 , totalEventSize(observables.size() + 2) // number of observables plus eventnumber
179 // should include m12, m34, cos12, cos34, phi, eventnumber, dtime, sigmat. In this order!
180 for(auto &observable : observables) {
181 registerObservable(observable);
184 std::vector<fptype> decayConstants;
185 decayConstants.push_back(decayInfo.meson_radius);
187 for(double &particle_masse : decayInfo.particle_masses) {
188 decayConstants.push_back(particle_masse);
192 registerObservable(*mistag);
194 decayConstants.push_back(1); // Flags existence of mistag
197 std::vector<unsigned int> pindices;
198 pindices.push_back(registerConstants(decayConstants.size()));
199 MEMCPY_TO_SYMBOL(functorConstants,
201 decayConstants.size() * sizeof(fptype),
202 cIndex * sizeof(fptype),
203 cudaMemcpyHostToDevice);
204 static int cacheCount = 0;
205 cacheToUse = cacheCount++;
206 pindices.push_back(cacheToUse);
207 pindices.push_back(0); //#LS
208 pindices.push_back(0); //#SF
209 pindices.push_back(0); //#AMP
210 pindices.push_back(0); // number of coefficients, because its not necessary to be equal to number of Amps.
211 pindices.push_back(registerParameter(decayInfo._tau));
212 pindices.push_back(registerParameter(decayInfo._xmixing));
213 pindices.push_back(registerParameter(decayInfo._ymixing));
214 pindices.push_back(registerParameter(decayInfo._SqWStoRSrate));
215 if(resolution->getDeviceFunction() < 0)
216 throw GooFit::GeneralError("The resolution device function index {} must be more than 0",
217 resolution->getDeviceFunction());
218 pindices.push_back(static_cast<unsigned int>(resolution->getDeviceFunction()));
220 // This is the start of reading in the amplitudes and adding the lineshapes and Spinfactors to this PDF
221 // This is done in this way so we don't have multiple copies of one lineshape in one pdf.
222 std::vector<unsigned int> ampidx;
223 std::vector<unsigned int> nPermVec;
224 std::vector<unsigned int> ampidxstart;
225 unsigned int coeff_counter = 0;
226 std::vector<Amplitude *> AmpBuffer;
228 std::vector<Amplitude *> AmpsA = decayInfo.amplitudes;
229 std::vector<Amplitude *> AmpsB = decayInfo.amplitudes_B;
231 for(auto &i : AmpsA) {
232 AmpMap[i->_uniqueDecayStr] = std::make_pair(std::vector<unsigned int>(0), std::vector<unsigned int>(0));
234 // printf("Adding Amplitde A:%s\n",AmpsA[i]->_uniqueDecayStr.c_str());
238 for(auto &LSIT : LSvec) {
239 auto found = std::find_if(components.begin(), components.end(), [&LSIT](const PdfBase *L) {
240 return (*LSIT) == *(dynamic_cast<const Lineshape *>(L));
243 if(found != components.end()) {
244 AmpMap[i->_uniqueDecayStr].first.push_back(std::distance(components.begin(), found));
245 // printf("LS %s found at %i\n",(*found)->getName().c_str(),std::distance(components.begin(), found));
247 components.push_back(LSIT);
248 AmpMap[i->_uniqueDecayStr].first.push_back(components.size() - 1);
249 // printf("Adding LS %s\n",(*LSIT)->getName().c_str());
255 for(auto &SFIT : SFvec) {
256 auto found = std::find_if(
257 SpinFactors.begin(), SpinFactors.end(), [&SFIT](const SpinFactor *S) { return (*SFIT) == (*S); });
259 if(found != SpinFactors.end()) {
260 AmpMap[i->_uniqueDecayStr].second.push_back(std::distance(SpinFactors.begin(), found));
261 // printf("SF %s found at %i\n",(*found)->getName().c_str(), std::distance(SpinFactors.begin(), found));
263 SpinFactors.push_back(SFIT);
264 AmpMap[i->_uniqueDecayStr].second.push_back(SpinFactors.size() - 1);
265 // printf("Adding SF %s\n",(*SFIT)->getName().c_str());
269 nPermVec.push_back(i->_nPerm);
270 pindices.push_back(registerParameter(i->_ar));
271 pindices.push_back(registerParameter(i->_ai));
273 AmpBuffer.push_back(i);
274 unsigned int flag = 0;
275 auto inB = std::find_if(AmpsB.begin(), AmpsB.end(), [AmpsA, &i](const Amplitude *A) { return *i == (*A); });
277 if(inB != AmpsB.end()) {
278 // printf("Found in AmpsB as well: %s\n", (*inB)->_uniqueDecayStr.c_str());
280 pindices.push_back(registerParameter((*inB)->_ar));
281 pindices.push_back(registerParameter((*inB)->_ai));
285 ampidxstart.push_back(ampidx.size());
286 std::vector<unsigned int> ls = AmpMap[i->_uniqueDecayStr].first;
287 std::vector<unsigned int> sf = AmpMap[i->_uniqueDecayStr].second;
288 ampidx.push_back(ls.size());
289 ampidx.push_back(sf.size());
290 ampidx.push_back(i->_nPerm);
291 ampidx.push_back(flag);
292 ampidx.insert(ampidx.end(), ls.begin(), ls.end());
293 ampidx.insert(ampidx.end(), sf.begin(), sf.end());
296 for(auto &i : AmpsB) {
297 unsigned int flag = 1;
299 = std::find_if(AmpBuffer.begin(), AmpBuffer.end(), [AmpsB, &i](const Amplitude *A) { return *i == (*A); });
301 if(inB != AmpBuffer.end())
304 AmpMap[i->_uniqueDecayStr] = std::make_pair(std::vector<unsigned int>(0), std::vector<unsigned int>(0));
305 // fprintf("Adding Amplitude B %s\n",AmpsB[i]->_uniqueDecayStr.c_str());
309 for(auto &LSIT : LSvec) {
310 auto found = std::find_if(components.begin(), components.end(), [&LSIT](const PdfBase *L) {
311 return (*LSIT) == *(dynamic_cast<const Lineshape *>(L));
314 if(found != components.end()) {
315 AmpMap[i->_uniqueDecayStr].first.push_back(std::distance(components.begin(), found));
316 // fprintf("LS %s found at %i\n",(*found)->getName().c_str(), std::distance(components.begin(), found));
318 components.push_back(LSIT);
319 AmpMap[i->_uniqueDecayStr].first.push_back(components.size() - 1);
320 // fprintf("Adding LS %s\n",(*LSIT)->getName().c_str());
326 for(auto &SFIT : SFvec) {
327 auto found = std::find_if(
328 SpinFactors.begin(), SpinFactors.end(), [&SFIT](const SpinFactor *S) { return (*SFIT) == (*S); });
330 if(found != SpinFactors.end()) {
331 AmpMap[i->_uniqueDecayStr].second.push_back(std::distance(SpinFactors.begin(), found));
332 // fprintf("SF %s found at %i\n",(*found)->getName().c_str(), std::distance(SpinFactors.begin(),
335 SpinFactors.push_back(SFIT);
336 AmpMap[i->_uniqueDecayStr].second.push_back(SpinFactors.size() - 1);
337 // fprintf("Adding SF %s\n",(*SFIT)->getName().c_str());
341 nPermVec.push_back(i->_nPerm);
342 pindices.push_back(registerParameter(i->_ar));
343 pindices.push_back(registerParameter(i->_ai));
345 ampidxstart.push_back(ampidx.size());
346 std::vector<unsigned int> ls = AmpMap[i->_uniqueDecayStr].first;
347 std::vector<unsigned int> sf = AmpMap[i->_uniqueDecayStr].second;
348 ampidx.push_back(ls.size());
349 ampidx.push_back(sf.size());
350 ampidx.push_back(i->_nPerm);
351 ampidx.push_back(flag);
352 ampidx.insert(ampidx.end(), ls.begin(), ls.end());
353 ampidx.insert(ampidx.end(), sf.begin(), sf.end());
357 AmpIndices, &(ampidxstart[0]), ampidxstart.size() * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
358 MEMCPY_TO_SYMBOL(AmpIndices,
360 ampidx.size() * sizeof(unsigned int),
361 ampidxstart.size() * sizeof(unsigned int),
362 cudaMemcpyHostToDevice);
364 pindices[2] = components.size();
365 pindices[3] = SpinFactors.size();
366 pindices[4] = AmpMap.size();
367 pindices[5] = coeff_counter;
369 for(auto &component : components) {
370 reinterpret_cast<Lineshape *>(component)->setConstantIndex(cIndex);
371 pindices.push_back(reinterpret_cast<Lineshape *>(component)->getFunctionIndex());
372 pindices.push_back(reinterpret_cast<Lineshape *>(component)->getParameterIndex());
375 for(auto &SpinFactor : SpinFactors) {
376 pindices.push_back(SpinFactor->getFunctionIndex());
377 pindices.push_back(SpinFactor->getParameterIndex());
378 SpinFactor->setConstantIndex(cIndex);
381 pindices.push_back(efficiency->getFunctionIndex());
382 pindices.push_back(efficiency->getParameterIndex());
383 components.push_back(efficiency);
385 // In case the resolution function needs parameters, this registers them.
386 resolution->createParameters(pindices, this);
387 GET_FUNCTION_ADDR(ptr_to_TDDP4);
388 initialize(pindices);
390 Integrator = new NormIntegrator_TD(parameters);
391 redoIntegral = new bool[components.size() - 1];
392 cachedMasses = new fptype[components.size() - 1];
393 cachedWidths = new fptype[components.size() - 1];
395 for(int i = 0; i < components.size() - 1; ++i) {
396 redoIntegral[i] = true;
397 cachedMasses[i] = -1;
398 cachedWidths[i] = -1;
399 lscalculators.push_back(new LSCalculator_TD(parameters, i));
402 for(int i = 0; i < SpinFactors.size(); ++i) {
403 sfcalculators.push_back(new SFCalculator_TD(parameters, i));
406 for(int i = 0; i < AmpMap.size(); ++i) {
407 AmpCalcs.push_back(new AmpCalc_TD(ampidxstart[i], parameters, nPermVec[i]));
410 // fprintf(stderr,"#Amp's %i, #LS %i, #SF %i \n", AmpMap.size(), components.size()-1, SpinFactors.size() );
412 std::vector<mcbooster::GReal_t> masses(decayInfo.particle_masses.begin() + 1, decayInfo.particle_masses.end());
413 mcbooster::PhaseSpace phsp(decayInfo.particle_masses[0], masses, MCeventsNorm, generation_offset);
414 phsp.Generate(mcbooster::Vector4R(decayInfo.particle_masses[0], 0.0, 0.0, 0.0));
417 auto nAcc = phsp.GetNAccepted();
418 mcbooster::BoolVector_d flags = phsp.GetAccRejFlags();
419 auto d1 = phsp.GetDaughters(0);
420 auto d2 = phsp.GetDaughters(1);
421 auto d3 = phsp.GetDaughters(2);
422 auto d4 = phsp.GetDaughters(3);
424 auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(d1.begin(), d2.begin(), d3.begin(), d4.begin()));
425 auto zip_end = zip_begin + d1.size();
426 auto new_end = thrust::remove_if(zip_begin, zip_end, flags.begin(), thrust::logical_not<bool>());
428 d1.erase(thrust::get<0>(new_end.get_iterator_tuple()), d1.end());
429 d2.erase(thrust::get<1>(new_end.get_iterator_tuple()), d2.end());
430 d3.erase(thrust::get<2>(new_end.get_iterator_tuple()), d3.end());
431 d4.erase(thrust::get<3>(new_end.get_iterator_tuple()), d4.end());
433 mcbooster::ParticlesSet_d pset(4);
439 norm_M12 = mcbooster::RealVector_d(nAcc);
440 norm_M34 = mcbooster::RealVector_d(nAcc);
441 norm_CosTheta12 = mcbooster::RealVector_d(nAcc);
442 norm_CosTheta34 = mcbooster::RealVector_d(nAcc);
443 norm_phi = mcbooster::RealVector_d(nAcc);
445 mcbooster::VariableSet_d VarSet(5);
446 VarSet[0] = &norm_M12;
447 VarSet[1] = &norm_M34;
448 VarSet[2] = &norm_CosTheta12;
449 VarSet[3] = &norm_CosTheta34;
450 VarSet[4] = &norm_phi;
453 mcbooster::EvaluateArray<Dim5>(eval, pset, VarSet);
455 norm_SF = mcbooster::RealVector_d(nAcc * SpinFactors.size());
456 norm_LS = mcbooster::mc_device_vector<fpcomplex>(nAcc * (components.size() - 1));
459 addSpecialMask(PdfBase::ForceSeparateNorm);
462 // makes the arrays to chache the lineshape values and spinfactors in CachedResSF and the values of the amplitudes in
464 // I made the choice to have spinfactors necxt to the values of the lineshape in memory. I waste memory by doing this
465 // because a spinfactor is saved as complex
466 // It would be nice to test if this is better than having the spinfactors stored seperately.
467 __host__ void TDDP4::setDataSize(unsigned int dataSize, unsigned int evtSize) {
468 // Default 3 is m12, m13, evtNum for DP 2dim, 4-body decay has 5 independent vars plus evtNum = 6
469 totalEventSize = evtSize;
470 if(totalEventSize < 3)
471 throw GooFit::GeneralError("totalEventSize {} must be 3 or more", totalEventSize);
479 numEntries = dataSize;
480 cachedResSF = new thrust::device_vector<fpcomplex>(
481 dataSize * (components.size() + SpinFactors.size() - 1)); // -1 because 1 component is efficiency
482 void *dummy = thrust::raw_pointer_cast(cachedResSF->data());
483 MEMCPY_TO_SYMBOL(cResSF_TD, &dummy, sizeof(fpcomplex *), cacheToUse * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
485 cachedAMPs = new thrust::device_vector<fpcomplex>(dataSize * (AmpCalcs.size()));
486 void *dummy2 = thrust::raw_pointer_cast(cachedAMPs->data());
487 MEMCPY_TO_SYMBOL(Amps_TD, &dummy2, sizeof(fpcomplex *), cacheToUse * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
492 // this is where the actual magic happens. This function does all the calculations!
493 __host__ fptype TDDP4::normalize() const {
494 if(cachedResSF == nullptr)
495 throw GeneralError("You must call dp.setDataSize(currData.getNumEvents(), N) first!");
497 // fprintf(stderr, "start normalize\n");
498 recursiveSetNormalisation(1); // Not going to normalize efficiency,
499 // so set normalisation factor to 1 so it doesn't get multiplied by zero.
500 // Copy at this time to ensure that the SpecialResonanceCalculators, which need the efficiency,
501 // don't get zeroes through multiplying by the normFactor.
502 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
504 // check if MINUIT changed any parameters and if so remember that so we know
505 // we need to recalculate that lineshape and every amp, that uses that lineshape
506 for(unsigned int i = 0; i < components.size() - 1; ++i) {
507 redoIntegral[i] = forceRedoIntegrals;
509 if(!(components[i]->parametersChanged()))
512 redoIntegral[i] = true;
515 SpinsCalculated = !forceRedoIntegrals;
516 forceRedoIntegrals = false;
518 // just some thrust iterators for the calculation.
519 thrust::constant_iterator<fptype *> dataArray(dev_event_array);
520 thrust::constant_iterator<int> eventSize(totalEventSize);
521 thrust::counting_iterator<int> eventIndex(0);
523 // Calculate spinfactors only once for normalisation events and real events
524 // strided_range is a template implemented in DalitsPlotHelpers.hh
525 // it basically goes through the array by increasing the pointer by a certain amount instead of just one step.
526 if(!SpinsCalculated) {
527 for(int i = 0; i < SpinFactors.size(); ++i) {
528 unsigned int offset = components.size() - 1;
530 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
531 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, dataArray, eventSize)),
532 strided_range<thrust::device_vector<fpcomplex>::iterator>(
533 cachedResSF->begin() + offset + i, cachedResSF->end(), (components.size() + SpinFactors.size() - 1))
535 *(sfcalculators[i]));
537 if(!generation_no_norm) {
539 thrust::make_zip_iterator(thrust::make_tuple(norm_M12.begin(),
541 norm_CosTheta12.begin(),
542 norm_CosTheta34.begin(),
544 thrust::make_zip_iterator(thrust::make_tuple(
545 norm_M12.end(), norm_M34.end(), norm_CosTheta12.end(), norm_CosTheta34.end(), norm_phi.end())),
546 (norm_SF.begin() + (i * MCevents)),
547 NormSpinCalculator_TD(parameters, i));
551 SpinsCalculated = true;
554 // fprintf(stderr, "normalize after spins\n");
556 // this calculates the values of the lineshapes and stores them in the array. It is recalculated every time
557 // parameters change.
558 for(int i = 0; i < components.size() - 1; ++i) {
559 if(redoIntegral[i]) {
561 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
562 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, dataArray, eventSize)),
563 strided_range<thrust::device_vector<fpcomplex>::iterator>(
564 cachedResSF->begin() + i, cachedResSF->end(), (components.size() + SpinFactors.size() - 1))
566 *(lscalculators[i]));
570 // fprintf(stderr, "normalize after LS\n");
572 // this is a little messy but it basically checks if the amplitude includes one of the recalculated lineshapes and
573 // if so recalculates that amplitude
574 auto AmpMapIt = AmpMap.begin();
576 for(int i = 0; i < AmpCalcs.size(); ++i) {
577 std::vector<unsigned int> redoidx((*AmpMapIt).second.first);
580 for(unsigned int j : redoidx) {
589 thrust::transform(eventIndex,
590 eventIndex + numEntries,
591 strided_range<thrust::device_vector<fpcomplex>::iterator>(
592 cachedAMPs->begin() + i, cachedAMPs->end(), AmpCalcs.size())
598 // fprintf(stderr, "normalize after Amps\n");
600 // lineshape value calculation for the normalisation, also recalculated every time parameter change
601 if(!generation_no_norm) {
602 for(int i = 0; i < components.size() - 1; ++i) {
607 thrust::make_zip_iterator(thrust::make_tuple(norm_M12.begin(),
609 norm_CosTheta12.begin(),
610 norm_CosTheta34.begin(),
612 thrust::make_zip_iterator(thrust::make_tuple(
613 norm_M12.end(), norm_M34.end(), norm_CosTheta12.end(), norm_CosTheta34.end(), norm_phi.end())),
614 (norm_LS.begin() + (i * MCevents)),
615 NormLSCalculator_TD(parameters, i));
619 thrust::constant_iterator<fptype *> normSFaddress(thrust::raw_pointer_cast(norm_SF.data()));
620 thrust::constant_iterator<fpcomplex *> normLSaddress(thrust::raw_pointer_cast(norm_LS.data()));
621 thrust::constant_iterator<int> NumNormEvents(MCevents);
623 // this does the rest of the integration with the cached lineshape and spinfactor values for the normalization
627 if(!generation_no_norm) {
628 thrust::tuple<fptype, fptype, fptype, fptype> dummy(0, 0, 0, 0);
629 FourDblTupleAdd MyFourDoubleTupleAdditionFunctor;
630 thrust::tuple<fptype, fptype, fptype, fptype> sumIntegral;
631 sumIntegral = thrust::transform_reduce(
632 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, NumNormEvents, normSFaddress, normLSaddress)),
633 thrust::make_zip_iterator(
634 thrust::make_tuple(eventIndex + MCevents, NumNormEvents, normSFaddress, normLSaddress)),
637 MyFourDoubleTupleAdditionFunctor);
639 // printf("normalize A2/#evts , B2/#evts: %.5g, %.5g\n",thrust::get<0>(sumIntegral)/MCevents,
640 // thrust::get<1>(sumIntegral)/MCevents);
641 fptype tau = host_params[host_indices[parameters + 7]];
642 fptype xmixing = host_params[host_indices[parameters + 8]];
643 fptype ymixing = host_params[host_indices[parameters + 9]];
645 ret = resolution->normalisation(thrust::get<0>(sumIntegral),
646 thrust::get<1>(sumIntegral),
647 thrust::get<2>(sumIntegral),
648 thrust::get<3>(sumIntegral),
653 // MCevents is the number of normalisation events.
657 host_normalisation[parameters] = 1.0 / ret;
658 // printf("end of normalize %f\n", ret);
663 std::tuple<mcbooster::ParticlesSet_h, mcbooster::VariableSet_h, mcbooster::RealVector_h, mcbooster::BoolVector_h>
664 TDDP4::GenerateSig(unsigned int numEvents) {
667 std::vector<mcbooster::GReal_t> masses(decayInfo.particle_masses.begin() + 1, decayInfo.particle_masses.end());
668 mcbooster::PhaseSpace phsp(decayInfo.particle_masses[0], masses, numEvents, generation_offset);
669 phsp.Generate(mcbooster::Vector4R(decayInfo.particle_masses[0], 0.0, 0.0, 0.0));
673 auto nAcc = phsp.GetNAccepted();
674 mcbooster::BoolVector_d flags = phsp.GetAccRejFlags();
675 auto d1 = phsp.GetDaughters(0);
676 auto d2 = phsp.GetDaughters(1);
677 auto d3 = phsp.GetDaughters(2);
678 auto d4 = phsp.GetDaughters(3);
680 auto zip_begin = thrust::make_zip_iterator(thrust::make_tuple(d1.begin(), d2.begin(), d3.begin(), d4.begin()));
681 auto zip_end = zip_begin + d1.size();
682 auto new_end = thrust::remove_if(zip_begin, zip_end, flags.begin(), thrust::logical_not<bool>());
684 flags = mcbooster::BoolVector_d();
686 d1.erase(thrust::get<0>(new_end.get_iterator_tuple()), d1.end());
687 d2.erase(thrust::get<1>(new_end.get_iterator_tuple()), d2.end());
688 d3.erase(thrust::get<2>(new_end.get_iterator_tuple()), d3.end());
689 d4.erase(thrust::get<3>(new_end.get_iterator_tuple()), d4.end());
691 mcbooster::ParticlesSet_d pset(4);
697 auto SigGen_M12_d = mcbooster::RealVector_d(nAcc);
698 auto SigGen_M34_d = mcbooster::RealVector_d(nAcc);
699 auto SigGen_CosTheta12_d = mcbooster::RealVector_d(nAcc);
700 auto SigGen_CosTheta34_d = mcbooster::RealVector_d(nAcc);
701 auto SigGen_phi_d = mcbooster::RealVector_d(nAcc);
702 auto dtime_d = mcbooster::RealVector_d(nAcc);
704 thrust::counting_iterator<unsigned int> index_sequence_begin(0);
706 fptype tau = host_params[host_indices[parameters + 7]];
707 fptype ymixing = host_params[host_indices[parameters + 9]];
708 fptype gammamin = 1.0 / tau - fabs(ymixing) / tau;
709 /*printf("hostparams: %f, %f", tau, ymixing);*/
712 index_sequence_begin, index_sequence_begin + nAcc, dtime_d.begin(), genExp(generation_offset, gammamin));
714 mcbooster::VariableSet_d VarSet_d(5);
715 VarSet_d[0] = &SigGen_M12_d;
716 VarSet_d[1] = &SigGen_M34_d;
717 VarSet_d[2] = &SigGen_CosTheta12_d;
718 VarSet_d[3] = &SigGen_CosTheta34_d;
719 VarSet_d[4] = &SigGen_phi_d;
722 mcbooster::EvaluateArray<Dim5>(eval, pset, VarSet_d);
724 auto h1 = new mcbooster::Particles_h(d1);
725 auto h2 = new mcbooster::Particles_h(d2);
726 auto h3 = new mcbooster::Particles_h(d3);
727 auto h4 = new mcbooster::Particles_h(d4);
729 mcbooster::ParticlesSet_h ParSet(4);
735 auto SigGen_M12_h = new mcbooster::RealVector_h(SigGen_M12_d);
736 auto SigGen_M34_h = new mcbooster::RealVector_h(SigGen_M34_d);
737 auto SigGen_CosTheta12_h = new mcbooster::RealVector_h(SigGen_CosTheta12_d);
738 auto SigGen_CosTheta34_h = new mcbooster::RealVector_h(SigGen_CosTheta34_d);
739 auto SigGen_phi_h = new mcbooster::RealVector_h(SigGen_phi_d);
740 auto dtime_h = new mcbooster::RealVector_h(dtime_d);
742 mcbooster::VariableSet_h VarSet(6);
743 VarSet[0] = SigGen_M12_h;
744 VarSet[1] = SigGen_M34_h;
745 VarSet[2] = SigGen_CosTheta12_h;
746 VarSet[3] = SigGen_CosTheta34_h;
747 VarSet[4] = SigGen_phi_h;
752 auto DS = new mcbooster::RealVector_d(8 * nAcc);
753 thrust::counting_iterator<int> eventNumber(0);
757 for(int i = 0; i < 5; ++i) {
758 mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr(DS->begin() + i, DS->end(), 8);
759 thrust::copy(VarSet_d[i]->begin(), VarSet_d[i]->end(), sr.begin());
762 mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr(DS->begin() + 5, DS->end(), 8);
763 thrust::copy(eventNumber, eventNumber + nAcc, sr.begin());
765 mcbooster::strided_range<mcbooster::RealVector_d::iterator> sr2(DS->begin() + 6, DS->end(), 8);
766 thrust::fill_n(sr2.begin(), nAcc, 0);
768 dev_event_array = thrust::raw_pointer_cast(DS->data());
769 setDataSize(nAcc, 8);
771 generation_no_norm = true; // we need no normalization for generation, but we do need to make sure that norm = 1;
775 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
777 thrust::device_vector<fptype> weights(nAcc);
778 thrust::constant_iterator<int> eventSize(8);
779 thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
780 thrust::counting_iterator<int> eventIndex(0);
782 MetricTaker evalor(this, getMetricPointer("ptr_to_Prob"));
783 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
784 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, arrayAddress, eventSize)),
787 cudaDeviceSynchronize();
789 fptype wmax = 1.1 * (fptype)*thrust::max_element(weights.begin(), weights.end());
791 if(wmax > maxWeight && maxWeight != 0)
793 "WARNING: you just encountered a higher maximum weight than observed in previous iterations.\n"
794 "WARNING: Consider recalculating your AccRej flags and acceping based upon these.\n"
795 "WARNING: previous weight: %.4g, new weight: %.4g\n",
799 maxWeight = wmax > maxWeight ? wmax : maxWeight;
801 thrust::copy(dtime_d.begin(), dtime_d.end(), sr2.begin());
803 dtime_d = mcbooster::RealVector_d();
804 thrust::device_vector<fptype> results(nAcc);
806 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
807 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, arrayAddress, eventSize)),
811 cudaDeviceSynchronize();
813 thrust::device_vector<fptype> flag2(nAcc);
814 thrust::counting_iterator<mcbooster::GLong_t> first(0);
815 // thrust::counting_iterator<mcbooster::GLong_t> last = first + nAcc;
817 // we do not want to copy the whole class to the GPU so capturing *this is not a great option
818 // therefore perpare local copies to capture the variables we need
819 unsigned int tmpoff = generation_offset;
820 unsigned int tmpparam = parameters;
824 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, results.begin(), arrayAddress, eventSize)),
825 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + nAcc, results.end(), arrayAddress, eventSize)),
827 exp_functor(tmpparam, tmpoff, gammamin, wmax));
829 gooFree(dev_event_array);
831 /*printf("Offset: %i und wmax:%.5g\n",generation_offset, wmax );*/
833 auto weights_h = mcbooster::RealVector_h(weights);
834 auto results_h = mcbooster::RealVector_h(results);
835 auto flags_h = mcbooster::BoolVector_h(flag2);
836 cudaDeviceSynchronize();
838 return std::make_tuple(ParSet, VarSet, weights_h, flags_h);
841 SFCalculator_TD::SFCalculator_TD(int pIdx, unsigned int sf_idx)
842 : _spinfactor_i(sf_idx)
843 , _parameters(pIdx) {}
845 __device__ fpcomplex SFCalculator_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
846 int evtNum = thrust::get<0>(t);
847 fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
849 unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
850 int parameter_i = 12 + (2 * indices[6]) + (indices[3] * 2)
851 + (_spinfactor_i * 2); // Find position of this resonance relative to DALITZPLOT start
852 unsigned int functn_i = indices[parameter_i];
853 unsigned int params_i = indices[parameter_i + 1];
855 fptype m12 = evt[indices[2 + indices[0]]];
856 fptype m34 = evt[indices[3 + indices[0]]];
857 fptype cos12 = evt[indices[4 + indices[0]]];
858 fptype cos34 = evt[indices[5 + indices[0]]];
859 fptype phi = evt[indices[6 + indices[0]]];
862 get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
863 // printf("%i, %i, %f, %f, %f, %f, %f \n",evtNum, thrust::get<2>(t), m12, m34, cos12, cos34, phi );
864 // printf("vec%i %f, %f, %f, %f\n",0, vecs[0], vecs[1], vecs[2], vecs[3]);
865 // printf("vec%i %f, %f, %f, %f\n",1, vecs[4], vecs[5], vecs[6], vecs[7]);
866 // printf("vec%i %f, %f, %f, %f\n",2, vecs[8], vecs[9], vecs[10], vecs[11]);
867 // printf("vec%i %f, %f, %f, %f\n",3, vecs[12], vecs[13], vecs[14], vecs[15]);
869 auto func = reinterpret_cast<spin_function_ptr>(device_function_table[functn_i]);
870 fptype sf = (*func)(vecs, paramIndices + params_i);
871 // printf("SpinFactors %i : %.7g\n",_spinfactor_i, sf );
875 NormSpinCalculator_TD::NormSpinCalculator_TD(int pIdx, unsigned int sf_idx)
876 : _spinfactor_i(sf_idx)
877 , _parameters(pIdx) {}
879 __device__ fptype NormSpinCalculator_TD::operator()(
880 thrust::tuple<mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t> t)
882 unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
883 int parameter_i = 12 + (2 * indices[6]) + (indices[3] * 2)
884 + (_spinfactor_i * 2); // Find position of this resonance relative to DALITZPLOT start
885 unsigned int functn_i = indices[parameter_i];
886 unsigned int params_i = indices[parameter_i + 1];
888 fptype m12 = (thrust::get<0>(t));
889 fptype m34 = (thrust::get<1>(t));
890 fptype cos12 = (thrust::get<2>(t));
891 fptype cos34 = (thrust::get<3>(t));
892 fptype phi = (thrust::get<4>(t));
895 get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
897 // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,0, vecs[0], vecs[1], vecs[2], vecs[3]);
898 // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,1, vecs[4], vecs[5], vecs[6], vecs[7]);
899 // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,2, vecs[8], vecs[9], vecs[10], vecs[11]);
900 // printf("evt %i vec%i %.5g, %.5g, %.5g, %.5g\n", evtNum,3, vecs[12], vecs[13], vecs[14], vecs[15]);
902 auto func = reinterpret_cast<spin_function_ptr>(device_function_table[functn_i]);
903 fptype sf = (*func)(vecs, paramIndices + params_i);
905 // printf("NormSF evt:%.5g, %.5g, %.5g, %.5g, %.5g\n", m12, m34, cos12, cos34, phi);
906 // printf("NormSF %i, %.7g\n",_spinfactor_i, sf );
911 LSCalculator_TD::LSCalculator_TD(int pIdx, unsigned int res_idx)
912 : _resonance_i(res_idx)
913 , _parameters(pIdx) {}
915 __device__ fpcomplex LSCalculator_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
916 // Calculates the BW values for a specific resonance.
919 int evtNum = thrust::get<0>(t);
920 fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
922 unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
924 = 12 + (2 * indices[6]) + (_resonance_i * 2); // Find position of this resonance relative to DALITZPLOT start
925 unsigned int functn_i = indices[parameter_i];
926 unsigned int params_i = indices[parameter_i + 1];
927 unsigned int pair = (paramIndices + params_i)[5];
929 fptype m1 = functorConstants[indices[1] + 2];
930 fptype m2 = functorConstants[indices[1] + 3];
931 fptype m3 = functorConstants[indices[1] + 4];
932 fptype m4 = functorConstants[indices[1] + 5];
934 fptype m12 = evt[indices[2 + indices[0]]];
935 fptype m34 = evt[indices[3 + indices[0]]];
936 fptype cos12 = evt[indices[4 + indices[0]]];
937 fptype cos34 = evt[indices[5 + indices[0]]];
938 fptype phi = evt[indices[6 + indices[0]]];
941 fptype mres = pair == 0 ? m12 : m34;
942 fptype d1 = pair == 0 ? m1 : m3;
943 fptype d2 = pair == 0 ? m2 : m4;
944 ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
945 // printf("LS_nt %i: mass:%f, %f i%f\n",_resonance_i, mres, ret.real, ret.imag );
948 get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
950 fptype mres = getmass(pair, d1, d2, vecs, m1, m2, m3, m4);
951 ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
952 // printf("LS %i: mass:%f, %f i%f\n",_resonance_i, mres, ret.real, ret.imag );
955 // printf("LS %i: %.7g, %.7g, %.7g, %.7g, %.7g \n",evtNum, m12, m34, cos12, cos34, phi );
957 // if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return ret;
958 // printf("m12 %f \n", m12); // %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
959 // printf("#Parameters %i, #LS %i, #SF %i, #AMP %i \n", indices[0], indices[3], indices[4], indices[5]);
960 // printf("BW_%i : %f %f\n", _resonance_i, ret.real, ret.imag);
965 NormLSCalculator_TD::NormLSCalculator_TD(int pIdx, unsigned int res_idx)
966 : _resonance_i(res_idx)
967 , _parameters(pIdx) {}
969 __device__ fpcomplex NormLSCalculator_TD::operator()(
970 thrust::tuple<mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t, mcbooster::GReal_t> t)
972 // Calculates the BW values for a specific resonance.
975 unsigned int *indices = paramIndices + _parameters; // Jump to DALITZPLOT position within parameters array
977 = 12 + (2 * indices[6]) + (_resonance_i * 2); // Find position of this resonance relative to DALITZPLOT start
978 unsigned int functn_i = indices[parameter_i];
979 unsigned int params_i = indices[parameter_i + 1];
980 unsigned int pair = (paramIndices + params_i)[5];
982 fptype m1 = functorConstants[indices[1] + 2];
983 fptype m2 = functorConstants[indices[1] + 3];
984 fptype m3 = functorConstants[indices[1] + 4];
985 fptype m4 = functorConstants[indices[1] + 5];
987 fptype m12 = (thrust::get<0>(t));
988 fptype m34 = (thrust::get<1>(t));
989 fptype cos12 = (thrust::get<2>(t));
990 fptype cos34 = (thrust::get<3>(t));
991 fptype phi = (thrust::get<4>(t));
994 fptype mres = pair == 0 ? m12 : m34;
995 fptype d1 = pair == 0 ? m1 : m3;
996 fptype d2 = pair == 0 ? m2 : m4;
997 ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
1000 get4Vecs(vecs, indices[1], m12, m34, cos12, cos34, phi);
1002 fptype mres = getmass(pair, d1, d2, vecs, m1, m2, m3, m4);
1003 ret = getResonanceAmplitude(mres, d1, d2, functn_i, params_i);
1006 // printf("NormLS %f, %f, %f, %f, %f \n",m12, m34, cos12, cos34, phi );
1007 // printf("%i, %i, %i, %i, %i \n",numLS, numSF, numAmps, offset, evtNum );
1008 // printf("NLS %i, %f, %f\n",_resonance_i,ret.real, ret.imag);
1010 // printf("m12 %f \n", m12); // %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
1011 // printf("#Parameters %i, #LS %i, #SF %i, #AMP %i \n", indices[0], indices[3], indices[4], indices[5]);
1016 AmpCalc_TD::AmpCalc_TD(unsigned int AmpIdx, unsigned int pIdx, unsigned int nPerm)
1019 , _parameters(pIdx) {}
1021 __device__ fpcomplex AmpCalc_TD::operator()(thrust::tuple<int, fptype *, int> t) const {
1022 unsigned int *indices = paramIndices + _parameters;
1023 unsigned int cacheToUse = indices[2];
1024 unsigned int totalLS = indices[3];
1025 unsigned int totalSF = indices[4];
1026 unsigned int totalAMP = indices[5];
1027 unsigned int offset = totalLS + totalSF;
1028 unsigned int numLS = AmpIndices[totalAMP + _AmpIdx];
1029 unsigned int numSF = AmpIndices[totalAMP + _AmpIdx + 1];
1030 unsigned int evtNum = thrust::get<0>(t);
1032 fpcomplex returnVal(0, 0);
1033 unsigned int SF_step = numSF / _nPerm;
1034 unsigned int LS_step = numLS / _nPerm;
1036 for(int i = 0; i < _nPerm; ++i) {
1037 fpcomplex ret(1, 0);
1039 for(int j = i * LS_step; j < (i + 1) * LS_step; ++j) {
1040 ret *= (cResSF_TD[cacheToUse][evtNum * offset + AmpIndices[totalAMP + _AmpIdx + 4 + j]]);
1041 // printf("Lineshape %i = (%.7g, %.7g)\n", j, (cResSF_TD[cacheToUse][evtNum*offset + AmpIndices[totalAMP +
1042 // _AmpIdx + 4 + j]]).real, (cResSF_TD[cacheToUse][evtNum*offset + AmpIndices[totalAMP + _AmpIdx + 4 +
1046 // printf("Lineshape Product = (%.7g, %.7g)\n", ret.real, ret.imag);
1047 for(int j = i * SF_step; j < (i + 1) * SF_step; ++j) {
1048 ret *= (cResSF_TD[cacheToUse][evtNum * offset + totalLS + AmpIndices[totalAMP + _AmpIdx + 4 + numLS + j]]
1050 // printf(" SF = %.7g\n", (cResSF_TD[cacheToUse][evtNum*offset + totalLS + AmpIndices[totalAMP + _AmpIdx + 4
1051 // + numLS + j]].real));
1054 // printf("Lineshape Product * SF = (%.7g, %.7g)\n", ret.real, ret.imag);
1059 returnVal *= (1 / sqrt(static_cast<fptype>(_nPerm)));
1060 // printf("Amplitude Value = (%.7g, %.7g)\n", returnVal.real, returnVal.imag);
1064 NormIntegrator_TD::NormIntegrator_TD(unsigned int pIdx)
1065 : _parameters(pIdx) {}
1067 __device__ thrust::tuple<fptype, fptype, fptype, fptype> NormIntegrator_TD::
1068 operator()(thrust::tuple<int, int, fptype *, fpcomplex *> t) const {
1069 unsigned int *indices = paramIndices + _parameters;
1070 unsigned int totalAMP = indices[5];
1072 unsigned int evtNum = thrust::get<0>(t);
1073 unsigned int MCevents = thrust::get<1>(t);
1074 fptype *SFnorm = thrust::get<2>(t) + evtNum;
1075 fpcomplex *LSnorm = thrust::get<3>(t) + evtNum;
1077 fpcomplex AmpA(0, 0);
1078 fpcomplex AmpB(0, 0);
1079 fpcomplex amp_A, amp_B;
1083 for(int amp = 0; amp < totalAMP; ++amp) {
1084 unsigned int ampidx = AmpIndices[amp];
1085 unsigned int numLS = AmpIndices[totalAMP + ampidx];
1086 unsigned int numSF = AmpIndices[totalAMP + ampidx + 1];
1087 unsigned int nPerm = AmpIndices[totalAMP + ampidx + 2];
1088 unsigned int flag = AmpIndices[totalAMP + ampidx + 3];
1089 unsigned int SF_step = numSF / nPerm;
1090 unsigned int LS_step = numLS / nPerm;
1091 fpcomplex ret2(0, 0);
1092 // printf("%i, %i, %i, %i, %i, %i, %i, %i, %i, %f\n",ampidx, amp, numLS, numSF, nPerm,AmpIndices[totalAMP +
1093 // ampidx + 4 + 0], AmpIndices[totalAMP + ampidx + 4 + 1], AmpIndices[totalAMP + ampidx + 4 + 2],
1094 // AmpIndices[totalAMP + ampidx + 4 + 3], (1/sqrt((fptype)(nPerm))) );
1096 for(int j = 0; j < nPerm; ++j) {
1097 fpcomplex ret(1, 0);
1099 for(int i = j * LS_step; i < (j + 1) * LS_step; ++i) {
1100 fpcomplex matrixelement(LSnorm[AmpIndices[totalAMP + ampidx + 4 + i] * MCevents]);
1101 // printf("Norm BW %i, %.5g, %.5g\n",AmpIndices[totalAMP + ampidx + 4 + i] , matrixelement.real,
1102 // matrixelement.imag);
1103 ret *= matrixelement;
1106 for(int i = j * SF_step; i < (j + 1) * SF_step; ++i) {
1107 fptype matrixelement = (SFnorm[AmpIndices[totalAMP + ampidx + 4 + numLS + i] * MCevents]);
1108 // printf("Norm SF %i, %.5g\n",AmpIndices[totalAMP + ampidx + 4 + i] , matrixelement);
1109 ret *= matrixelement;
1115 ret2 *= (1 / sqrt(static_cast<fptype>(nPerm)));
1116 // printf("Result Amplitude %i, %i, %.5g, %.5g\n",flag, amp, ret2.real, ret2.imag);
1120 amp_A = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1121 AmpA += ret2 * amp_A;
1125 amp_B = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1126 AmpB += ret2 * amp_B;
1130 amp_A = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1131 AmpA += ret2 * amp_A;
1133 amp_B = fpcomplex(cudaArray[indices[12 + 2 * (amp + k)]], cudaArray[indices[13 + 2 * (amp + k)]]);
1134 AmpB += ret2 * amp_B;
1139 fptype _SqWStoRSrate = cudaArray[indices[10]];
1140 AmpA *= _SqWStoRSrate;
1142 auto AmpAB = AmpA * conj(AmpB);
1143 return {thrust::norm(AmpA), thrust::norm(AmpB), AmpAB.real(), AmpAB.imag()};
1146 } // namespace GooFit