1 #include <goofit/Error.h>
     2 #include <goofit/PDFs/physics/DalitzPlotPdf.h>
     3 #include <goofit/detail/Complex.h>
     5 #include <thrust/copy.h>
     6 #include <thrust/transform_reduce.h>
    10 // Functor used for fit fraction sum
    11 struct CoefSumFunctor {
    15     CoefSumFunctor(fpcomplex coef_i, fpcomplex coef_j)
    19     __device__ fptype operator()(thrust::tuple<fpcomplex, fpcomplex> val) {
    20         return (coef_i * thrust::conj<fptype>(coef_j) * thrust::get<0>(val) * thrust::conj<fptype>(thrust::get<1>(val)))
    25 constexpr int resonanceOffset_DP = 4; // Offset of the first resonance into the parameter index array
    26 // Offset is number of parameters, constant index, number of resonances (not calculable
    27 // from nP because we don't know what the efficiency might need), and cache index. Efficiency
    28 // parameters are after the resonance information.
    30 // The function of this array is to hold all the cached waves; specific
    31 // waves are recalculated when the corresponding resonance mass or width
    32 // changes. Note that in a multithread environment each thread needs its
    33 // own cache, hence the '10'. Ten threads should be enough for anyone!
    35 // NOTE: This is does not support ten instances (ten threads) of resoncances now, only one set of resonances.
    36 __device__ fpcomplex *cResonances[16];
    38 __device__ inline int parIndexFromResIndex_DP(int resIndex) { return resonanceOffset_DP + resIndex * resonanceSize; }
    41 device_DalitzPlot_calcIntegrals(fptype m12, fptype m13, int res_i, int res_j, fptype *p, unsigned int *indices) {
    42     // Calculates BW_i(m12, m13) * BW_j^*(m12, m13).
    43     // This calculation is in a separate function so
    44     // it can be cached. Note that this function expects
    45     // to be called on a normalisation grid, not on
    46     // observed points, that's why it doesn't use
    47     // cResonances. No need to cache the values at individual
    48     // grid points - we only care about totals.
    49     fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
    50     fptype daug1Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
    51     fptype daug2Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
    52     fptype daug3Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
    54     fpcomplex ret{0., 0.};
    56     if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
    60         = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
    62     int parameter_i       = parIndexFromResIndex_DP(res_i);
    63     unsigned int functn_i = RO_CACHE(indices[parameter_i + 2]);
    64     unsigned int params_i = RO_CACHE(indices[parameter_i + 3]);
    65     ret                   = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
    67     int parameter_j       = parIndexFromResIndex_DP(res_j);
    68     unsigned int functn_j = RO_CACHE(indices[parameter_j + 2]);
    69     unsigned int params_j = RO_CACHE(indices[parameter_j + 3]);
    70     ret *= conj(getResonanceAmplitude(m12, m13, m23, functn_j, params_j));
    75 __device__ fptype device_DalitzPlot(fptype *evt, fptype *p, unsigned int *indices) {
    76     fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
    77     fptype daug1Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
    78     fptype daug2Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
    79     fptype daug3Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);
    81     fptype m12 = RO_CACHE(evt[RO_CACHE(indices[2 + RO_CACHE(indices[0])])]);
    82     fptype m13 = RO_CACHE(evt[RO_CACHE(indices[3 + RO_CACHE(indices[0])])]);
    84     if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
    87     fptype evtIndex = RO_CACHE(evt[RO_CACHE(indices[4 + RO_CACHE(indices[0])])]);
    89     auto evtNum = static_cast<int>(floor(0.5 + evtIndex));
    91     fpcomplex totalAmp(0, 0);
    92     unsigned int numResonances = RO_CACHE(indices[2]);
    93     // unsigned int cacheToUse    = RO_CACHE(indices[3]);
    95     for(int i = 0; i < numResonances; ++i) {
    96         int paramIndex = parIndexFromResIndex_DP(i);
    98             = fpcomplex(RO_CACHE(p[RO_CACHE(indices[paramIndex + 0])]), RO_CACHE(p[RO_CACHE(indices[paramIndex + 1])]));
   100         // potential performance improvement by
   101         // double2 me = RO_CACHE(reinterpret_cast<double2*> (cResonances[i][evtNum]));
   102         fpcomplex me = RO_CACHE(cResonances[i][evtNum]);
   104         // fpcomplex matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
   105         //                  (cResonances[cacheToUse][evtNum*numResonances + i]).imag);
   106         // fpcomplex matrixelement (me[0], me[1]);
   108         totalAmp += amp * me;
   111     fptype ret         = thrust::norm(totalAmp);
   112     int effFunctionIdx = parIndexFromResIndex_DP(numResonances);
   113     fptype eff         = callFunction(evt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1]));
   116     // printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real,
   122 __device__ device_function_ptr ptr_to_DalitzPlot = device_DalitzPlot;
   124 __host__ DalitzPlotPdf::DalitzPlotPdf(
   125     std::string n, Observable m12, Observable m13, EventNumber eventNumber, DecayInfo3 decay, GooPdf *efficiency)
   126     : GooPdf(n, m12, m13, eventNumber)
   130     , _eventNumber(eventNumber)
   131     , dalitzNormRange(nullptr)
   134     , forceRedoIntegrals(true)
   135     , totalEventSize(3) // Default 3 = m12, m13, evtNum
   137     , integrators(nullptr)
   138     , calculators(nullptr) {
   139     fptype decayConstants[5];
   141     for(auto &cachedWave : cachedWaves)
   142         cachedWave = nullptr;
   144     std::vector<unsigned int> pindices;
   145     pindices.push_back(registerConstants(5));
   146     decayConstants[0] = decayInfo.motherMass;
   147     decayConstants[1] = decayInfo.daug1Mass;
   148     decayConstants[2] = decayInfo.daug2Mass;
   149     decayConstants[3] = decayInfo.daug3Mass;
   150     decayConstants[4] = decayInfo.meson_radius;
   152         functorConstants, decayConstants, 5 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);
   154     pindices.push_back(decayInfo.resonances.size());
   155     static int cacheCount = 0;
   156     cacheToUse            = cacheCount++;
   157     pindices.push_back(cacheToUse);
   159     for(auto &resonance : decayInfo.resonances) {
   160         pindices.push_back(registerParameter(resonance->amp_real));
   161         pindices.push_back(registerParameter(resonance->amp_imag));
   162         pindices.push_back(resonance->getFunctionIndex());
   163         pindices.push_back(resonance->getParameterIndex());
   164         resonance->setConstantIndex(cIndex);
   165         components.push_back(resonance);
   168     pindices.push_back(efficiency->getFunctionIndex());
   169     pindices.push_back(efficiency->getParameterIndex());
   170     components.push_back(efficiency);
   172     GET_FUNCTION_ADDR(ptr_to_DalitzPlot);
   173     initialize(pindices);
   175     redoIntegral = new bool[decayInfo.resonances.size()];
   176     cachedMasses = new fptype[decayInfo.resonances.size()];
   177     cachedWidths = new fptype[decayInfo.resonances.size()];
   178     integrals    = new fpcomplex **[decayInfo.resonances.size()];
   179     integrators  = new SpecialResonanceIntegrator **[decayInfo.resonances.size()];
   180     calculators  = new SpecialResonanceCalculator *[decayInfo.resonances.size()];
   182     for(int i = 0; i < decayInfo.resonances.size(); ++i) {
   183         redoIntegral[i] = true;
   184         cachedMasses[i] = -1;
   185         cachedWidths[i] = -1;
   186         integrators[i]  = new SpecialResonanceIntegrator *[decayInfo.resonances.size()];
   187         calculators[i]  = new SpecialResonanceCalculator(parameters, i);
   188         integrals[i]    = new fpcomplex *[decayInfo.resonances.size()];
   190         for(int j = 0; j < decayInfo.resonances.size(); ++j) {
   191             integrals[i][j]   = new fpcomplex(0, 0);
   192             integrators[i][j] = new SpecialResonanceIntegrator(parameters, i, j);
   196     addSpecialMask(PdfBase::ForceSeparateNorm);
   199 __host__ void DalitzPlotPdf::setDataSize(unsigned int dataSize, unsigned int evtSize) {
   200     // Default 3 is m12, m13, evtNum
   201     totalEventSize = evtSize;
   202     if(totalEventSize < 3)
   203         throw GooFit::GeneralError("totalEventSize {} must be 3 or more", totalEventSize);
   205     // if (cachedWaves) delete cachedWaves;
   207         for(auto &cachedWave : cachedWaves) {
   209             cachedWave = nullptr;
   213     numEntries = dataSize;
   215     for(int i = 0; i < 16; i++) {
   217         cachedWaves[i] = new thrust::device_vector<fpcomplex>(m_iEventsPerTask);
   219         cachedWaves[i] = new thrust::device_vector<fpcomplex>(dataSize);
   221         void *dummy = thrust::raw_pointer_cast(cachedWaves[i]->data());
   222         MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(fpcomplex *), i * sizeof(fpcomplex *), cudaMemcpyHostToDevice);
   228 __host__ fptype DalitzPlotPdf::normalize() const {
   229     recursiveSetNormalisation(1); // Not going to normalize efficiency,
   230     // so set normalisation factor to 1 so it doesn't get multiplied by zero.
   231     // Copy at this time to ensure that the SpecialResonanceCalculators, which need the efficiency,
   232     // don't get zeroes through multiplying by the normFactor.
   233     MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
   235     int totalBins = _m12.getNumBins() * _m13.getNumBins();
   237     if(!dalitzNormRange) {
   238         gooMalloc((void **)&dalitzNormRange, 6 * sizeof(fptype));
   241     // This line runs once
   242     static std::array<fptype, 6> host_norms{{0, 0, 0, 0, 0, 0}};
   244     std::array<fptype, 6> current_host_norms{{_m12.getLowerLimit(),
   245                                               _m12.getUpperLimit(),
   246                                               static_cast<fptype>(_m12.getNumBins()),
   247                                               _m13.getLowerLimit(),
   248                                               _m13.getUpperLimit(),
   249                                               static_cast<fptype>(_m13.getNumBins())}};
   251     if(host_norms != current_host_norms) {
   252         host_norms = current_host_norms;
   253         MEMCPY(dalitzNormRange, host_norms.data(), 6 * sizeof(fptype), cudaMemcpyHostToDevice);
   256     for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
   257         redoIntegral[i] = forceRedoIntegrals;
   259         if(!(decayInfo.resonances[i]->parametersChanged()))
   262         redoIntegral[i] = true;
   265     forceRedoIntegrals = false;
   267     // Only do this bit if masses or widths have changed.
   268     thrust::constant_iterator<fptype *> arrayAddress(dalitzNormRange);
   269     thrust::counting_iterator<int> binIndex(0);
   271     // NB, SpecialResonanceCalculator assumes that fit is unbinned!
   272     // And it needs to know the total event size, not just observables
   273     // for this particular PDF component.
   274     thrust::constant_iterator<fptype *> dataArray(dev_event_array);
   275     thrust::constant_iterator<int> eventSize(totalEventSize);
   276     thrust::counting_iterator<int> eventIndex(0);
   278     for(int i = 0; i < decayInfo.resonances.size(); ++i) {
   279         if(redoIntegral[i]) {
   282                 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
   283                 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
   284                 strided_range<thrust::device_vector<fpcomplex>::iterator>(
   285                     cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
   290                 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
   291                 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
   292                 strided_range<thrust::device_vector<fpcomplex>::iterator>(
   293                     cachedWaves[i]->begin(), cachedWaves[i]->end(), 1)
   299         // Possibly this can be done more efficiently by exploiting symmetry?
   300         for(int j = 0; j < decayInfo.resonances.size(); ++j) {
   301             if((!redoIntegral[i]) && (!redoIntegral[j]))
   304             fpcomplex dummy(0, 0);
   305             thrust::plus<fpcomplex> complexSum;
   306             (*(integrals[i][j])) = thrust::transform_reduce(
   307                 thrust::make_zip_iterator(thrust::make_tuple(binIndex, arrayAddress)),
   308                 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, arrayAddress)),
   309                 *(integrators[i][j]),
   315     // End of time-consuming integrals.
   316     fpcomplex sumIntegral(0, 0);
   318     for(unsigned int i = 0; i < decayInfo.resonances.size(); ++i) {
   319         int param_i = parameters + resonanceOffset_DP + resonanceSize * i;
   320         fpcomplex amplitude_i(host_params[host_indices[param_i]], host_params[host_indices[param_i + 1]]);
   322         for(unsigned int j = 0; j < decayInfo.resonances.size(); ++j) {
   323             int param_j = parameters + resonanceOffset_DP + resonanceSize * j;
   324             fpcomplex amplitude_j(host_params[host_indices[param_j]], -host_params[host_indices[param_j + 1]]);
   325             // Notice complex conjugation
   326             // printf("%f %f %f %f %f %f\n", amplitude_i.real(), amplitude_i.imag(), amplitude_j.real(),
   327             // amplitude_j.imag(), (*(integrals[i][j])).real(), (*(integrals[i][j])).imag() );
   328             sumIntegral += amplitude_i * amplitude_j * (*(integrals[i][j]));
   332     fptype ret           = sumIntegral.real(); // That complex number is a square, so it's fully real
   333     double binSizeFactor = 1;
   334     binSizeFactor *= _m12.getBinSize();
   335     binSizeFactor *= _m13.getBinSize();
   336     ret *= binSizeFactor;
   338     host_normalisation[parameters] = 1.0 / ret;
   339     // printf("%f %f\n", ret, binSizeFactor);
   343 __host__ fpcomplex DalitzPlotPdf::sumCachedWave(size_t i) const {
   344     const thrust::device_vector<fpcomplex> &vec = getCachedWaveNoCopy(i);
   346     fpcomplex ret = thrust::reduce(vec.begin(), vec.end(), fpcomplex(0, 0), thrust::plus<fpcomplex>());
   351 __host__ const std::vector<std::complex<fptype>> DalitzPlotPdf::getCachedWave(size_t i) const {
   352     auto ret_thrust = getCachedWave(i);
   353     std::vector<std::complex<fptype>> ret(ret_thrust.size());
   354     thrust::copy(ret_thrust.begin(), ret_thrust.end(), ret.begin());
   358 __host__ std::vector<std::vector<fptype>> DalitzPlotPdf::fit_fractions() {
   359     GOOFIT_DEBUG("Performing fit fraction calculation, should already have a cache (does not use normalization grid)");
   361     size_t n_res    = getDecayInfo().resonances.size();
   362     size_t nEntries = getCachedWaveNoCopy(0).size();
   364     std::vector<fpcomplex> coefs(n_res);
   365     std::transform(getDecayInfo().resonances.begin(),
   366                    getDecayInfo().resonances.end(),
   368                    [](ResonancePdf *res) { return fpcomplex(res->amp_real.getValue(), res->amp_imag.getValue()); });
   370     fptype buffer_all = 0;
   374     fpcomplex cached_i_val;
   375     fpcomplex cached_j_val;
   377     thrust::device_vector<fpcomplex> cached_i;
   378     thrust::device_vector<fpcomplex> cached_j;
   379     std::vector<std::vector<fptype>> Amps_int(n_res, std::vector<fptype>(n_res));
   381     for(size_t i = 0; i < n_res; i++) {
   382         for(size_t j = 0; j < n_res; j++) {
   384             cached_i = getCachedWaveNoCopy(i);
   385             cached_j = getCachedWaveNoCopy(j);
   389             buffer += thrust::transform_reduce(
   390                 thrust::make_zip_iterator(thrust::make_tuple(cached_i.begin(), cached_j.begin())),
   391                 thrust::make_zip_iterator(thrust::make_tuple(cached_i.end(), cached_j.end())),
   392                 CoefSumFunctor(coef_i, coef_j),
   394                 thrust::plus<fptype>());
   396             buffer_all += buffer;
   397             Amps_int[i][j] = (buffer / nEntries);
   401     fptype total_PDF = buffer_all / nEntries;
   403     std::vector<std::vector<fptype>> ff(n_res, std::vector<fptype>(n_res));
   405     for(size_t i = 0; i < n_res; i++)
   406         for(size_t j = 0; j < n_res; j++)
   407             ff[i][j] = (Amps_int[i][j] / total_PDF);
   412 SpecialResonanceIntegrator::SpecialResonanceIntegrator(int pIdx, unsigned int ri, unsigned int rj)
   415     , parameters(pIdx) {}
   417 __device__ fpcomplex SpecialResonanceIntegrator::operator()(thrust::tuple<int, fptype *> t) const {
   418     // Bin index, base address [lower, upper,getNumBins]
   419     // Notice that this is basically MetricTaker::operator (binned) with the special-case knowledge
   420     // that event size is two, and that the function to call is dev_DalitzPlot_calcIntegrals.
   422     int globalBinNumber  = thrust::get<0>(t);
   423     fptype lowerBoundM12 = thrust::get<1>(t)[0];
   424     fptype upperBoundM12 = thrust::get<1>(t)[1];
   425     auto numBinsM12      = static_cast<int>(floor(thrust::get<1>(t)[2] + 0.5));
   426     int binNumberM12     = globalBinNumber % numBinsM12;
   427     fptype binCenterM12  = upperBoundM12 - lowerBoundM12;
   428     binCenterM12 /= numBinsM12;
   429     binCenterM12 *= (binNumberM12 + 0.5);
   430     binCenterM12 += lowerBoundM12;
   432     globalBinNumber /= numBinsM12;
   433     fptype lowerBoundM13 = thrust::get<1>(t)[3];
   434     fptype upperBoundM13 = thrust::get<1>(t)[4];
   435     auto numBinsM13      = static_cast<int>(floor(thrust::get<1>(t)[5] + 0.5));
   436     fptype binCenterM13  = upperBoundM13 - lowerBoundM13;
   437     binCenterM13 /= numBinsM13;
   438     binCenterM13 *= (globalBinNumber + 0.5);
   439     binCenterM13 += lowerBoundM13;
   441     unsigned int *indices = paramIndices + parameters;
   443         = device_DalitzPlot_calcIntegrals(binCenterM12, binCenterM13, resonance_i, resonance_j, cudaArray, indices);
   445     fptype fakeEvt[10]; // Need room for many observables in case m12 or m13 were assigned a high index in an
   446                         // event-weighted fit.
   447     fakeEvt[indices[indices[0] + 2 + 0]] = binCenterM12;
   448     fakeEvt[indices[indices[0] + 2 + 1]] = binCenterM13;
   449     unsigned int numResonances           = indices[2];
   450     int effFunctionIdx                   = parIndexFromResIndex_DP(numResonances);
   451     fptype eff                           = callFunction(fakeEvt, indices[effFunctionIdx], indices[effFunctionIdx + 1]);
   453     // Multiplication by eff, not sqrt(eff), is correct:
   454     // These complex numbers will not be squared when they
   455     // go into the integrals. They've been squared already,
   458     // printf("ret %f %f %f %f %f\n",binCenterM12, binCenterM13, ret.real, ret.imag, eff );
   462 SpecialResonanceCalculator::SpecialResonanceCalculator(int pIdx, unsigned int res_idx)
   463     : resonance_i(res_idx)
   464     , parameters(pIdx) {}
   466 __device__ fpcomplex SpecialResonanceCalculator::operator()(thrust::tuple<int, fptype *, int> t) const {
   467     // Calculates the BW values for a specific resonance.
   469     int evtNum  = thrust::get<0>(t);
   470     fptype *evt = thrust::get<1>(t) + (evtNum * thrust::get<2>(t));
   472     unsigned int *indices = paramIndices + parameters; // Jump to DALITZPLOT position within parameters array
   473     fptype m12            = evt[indices[2 + indices[0]]];
   474     fptype m13            = evt[indices[3 + indices[0]]];
   476     fptype motherMass = functorConstants[indices[1] + 0];
   477     fptype daug1Mass  = functorConstants[indices[1] + 1];
   478     fptype daug2Mass  = functorConstants[indices[1] + 2];
   479     fptype daug3Mass  = functorConstants[indices[1] + 3];
   481     if(!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass))
   485         = motherMass * motherMass + daug1Mass * daug1Mass + daug2Mass * daug2Mass + daug3Mass * daug3Mass - m12 - m13;
   488         = parIndexFromResIndex_DP(resonance_i); // Find position of this resonance relative to DALITZPLOT start
   490     unsigned int functn_i = indices[parameter_i + 2];
   491     unsigned int params_i = indices[parameter_i + 3];
   493     ret = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
   494     // printf("Amplitude %f %f %f (%f, %f)\n ", m12, m13, m23, ret.real, ret.imag);
   498 } // namespace GooFit