#include <goofit/Error.h>
#include <goofit/PDFs/combine/ConvolutionPdf.h>
#include <goofit/Variable.h>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>

#include <iostream>

namespace GooFit {

int totalConvolutions = 0;
#define CONVOLUTION_CACHE_SIZE 512
// I would really like this to be 1024, which is the maximum number of threads
// in a block (for compute capability 2.0 and up). Unfortunately this causes
// the program to hang, presumably because there isn't enough memory and something
// goes wrong. So... 512 should be enough for anyone, right?
// Need multiple working spaces for the case of several convolutions in one PDF.
__constant__ fptype *dev_modWorkSpace[100];
__constant__ fptype *dev_resWorkSpace[100];
// Number which transforms the model range (x1, x2) into the resolution range (x1 - maxX, x2 - minX).
// It is equal to the maximum possible value of x0, i.e. maxX, in bins.
__constant__ int modelOffset[100];
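// Worked example (illustrative numbers, not taken from any particular fit): with a
// model range of (-10, 10), step 0.01, and an observable limited to (-5, 5), the
// resolution is cached on (-10 - 5, 10 - (-5)) = (-15, 15). The offset is then
// maxX / step = 5 / 0.01 = 500 bins, so model bin i lines up with resolution
// bin i + 500 when x0 = 0.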
__device__ fptype device_ConvolvePdfs(fptype *evt, fptype *p, unsigned int *indices) {
    fptype ret         = 0;
    fptype loBound     = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 0]);
    fptype hiBound     = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 1]);
    fptype step        = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 2]);
    fptype x0          = evt[indices[2 + indices[0]]];
    int workSpaceIndex = indices[6];
    auto numbins = static_cast<int>(floor((hiBound - loBound) / step + 0.5));

    fptype lowerBoundOffset = loBound / step;
    lowerBoundOffset -= floor(lowerBoundOffset);
    auto offsetInBins = static_cast<int>(floor(x0 / step - lowerBoundOffset));
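    // lowerBoundOffset is the fractional part of loBound / step, so offsetInBins
    // counts the number of whole bins between loBound and x0. This keeps the model
    // and resolution grids aligned even when loBound is not an exact multiple of step.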
    // Brute-force calculate integral M(x) * R(x - x0) dx
    int offset = RO_CACHE(modelOffset[workSpaceIndex]);

    for(int i = 0; i < numbins; ++i) {
        fptype model = RO_CACHE(dev_modWorkSpace[workSpaceIndex][i]);
        fptype resol = RO_CACHE(dev_resWorkSpace[workSpaceIndex][i + offset - offsetInBins]);

        ret += model * resol;
    }

    ret *= normalisationFactors[RO_CACHE(indices[2])];
    ret *= normalisationFactors[RO_CACHE(indices[4])];

    return ret;
}
__device__ fptype device_ConvolveSharedPdfs(fptype *evt, fptype *p, unsigned int *indices) {
    fptype ret                  = 0;
    fptype loBound              = functorConstants[indices[5] + 0];
    fptype hiBound              = functorConstants[indices[5] + 1];
    fptype step                 = functorConstants[indices[5] + 2];
    fptype x0                   = evt[indices[2 + indices[0]]];
    unsigned int workSpaceIndex = indices[6];
    unsigned int numOthers      = indices[7] + 1; // +1 for this PDF.
    auto numbins = static_cast<int>(floor((hiBound - loBound) / step + 0.5));

    fptype lowerBoundOffset = loBound / step;
    lowerBoundOffset -= floor(lowerBoundOffset);
    auto offsetInBins = static_cast<int>(floor(x0 / step - lowerBoundOffset));
    // Brute-force calculate integral M(x) * R(x - x0) dx
    __shared__ fptype modelCache[CONVOLUTION_CACHE_SIZE];
    // Don't try to shared-load more items than we have threads.
    unsigned int numToLoad
        = thrust::minimum<unsigned int>()(CONVOLUTION_CACHE_SIZE / numOthers, static_cast<unsigned int>(BLOCKDIM));
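    // The shared cache is divided into numOthers stripes of numToLoad bins each;
    // stripe w holds bins [i, i + numToLoad) of the workspace named by indices[8 + w],
    // so every convolution in the block finds its own model values in shared memory.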
    for(int i = 0; i < numbins; i += numToLoad) {
        // This code avoids the following problem: if event 0 is in workspace 0 and
        // event 1 is in workspace 1, then threads 0 and 1 cannot fill the
        // same modelCache, because they are drawing from different models!
        // What's more, it's not guaranteed that the whole modelCache will
        // be filled, because the threads can be all over the place in terms
        // of their workspace.
        if(THREADIDX < numToLoad) {
            for(unsigned int w = 0; w < numOthers; ++w) {
                unsigned int wIndex = indices[8 + w];
                modelCache[w * numToLoad + THREADIDX]
                    = (i + THREADIDX < numbins) ? dev_modWorkSpace[wIndex][i + THREADIDX] : 0;
            }
        }
        // This is slightly dangerous. If you have, for example,
        // a MappedPdf in which one of the other functions
        // is not a convolution, *you're gonna have a bad time*.
        // It's unfortunately up to the user to ensure that every
        // event will hit this point, by making sure that if *any*
        // event has a convolution, *all* events do.
        THREAD_SYNCH
        for(int j = 0; j < numToLoad; ++j) {
            if(i + j >= numbins)
                break;

            fptype model = modelCache[workSpaceIndex * numToLoad + j];
            fptype resol = (model != 0)
                               ? dev_resWorkSpace[workSpaceIndex][i + j + modelOffset[workSpaceIndex] - offsetInBins]
                               : 0;
            ret += model * resol;
        }
        // If we don't sync at this point, some of the warps can *run ahead*
        // and fill up their parts of modelCache with the *next* set of numbers,
        // which means that the warps still working on the sum get the wrong
        // numbers!
        THREAD_SYNCH
    }
    ret *= normalisationFactors[indices[2]];
    ret *= normalisationFactors[indices[4]];

    return ret;
}
__device__ device_function_ptr ptr_to_ConvolvePdfs       = device_ConvolvePdfs;
__device__ device_function_ptr ptr_to_ConvolveSharedPdfs = device_ConvolveSharedPdfs;
ConvolutionPdf::ConvolutionPdf(std::string n, Observable x, GooPdf *m, GooPdf *r)
    : GooPdf(n, x)
    , model(m)
    , resolution(r)
    , host_iConsts(nullptr)
    , modelWorkSpace(nullptr)
    , resolWorkSpace(nullptr)
    , workSpaceIndex(0) {
    // Constructor for convolution without cooperative
    // loading of the model cache. This is slow, but conceptually simple.
    components.push_back(model);
    components.push_back(resolution);
    // Indices stores a (function index, parameter index) doublet for the model and resolution functions.
    std::vector<unsigned int> paramIndices;
    paramIndices.push_back(model->getFunctionIndex());
    paramIndices.push_back(model->getParameterIndex());
    paramIndices.push_back(resolution->getFunctionIndex());
    paramIndices.push_back(resolution->getParameterIndex());
    paramIndices.push_back(registerConstants(3));
    paramIndices.push_back(workSpaceIndex = totalConvolutions++);
    GET_FUNCTION_ADDR(ptr_to_ConvolvePdfs);
    initialize(paramIndices);
    setIntegrationConstants(-10, 10, 0.01);
}
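// Hypothetical usage sketch (not part of this file; the PDF objects are made up
// for illustration): convolving a signal shape with a Gaussian resolution, then
// tightening the integration grid.
//
//   ConvolutionPdf conv("conv", x, &signalShape, &resolutionGauss);
//   conv.setIntegrationConstants(-5.0, 5.0, 0.005);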
ConvolutionPdf::ConvolutionPdf(std::string n, Observable x, GooPdf *m, GooPdf *r, unsigned int numOthers)
    : GooPdf(n, x)
    , model(m)
    , resolution(r)
    , host_iConsts(nullptr)
    , modelWorkSpace(nullptr)
    , resolWorkSpace(nullptr)
    , workSpaceIndex(0) {
    // Constructor for the case when several convolutions
    // can run in parallel; for example, a map where the targets
    // are convolutions. (Make sure that *all* the targets
    // are convolutions! Otherwise it will hang on the syncthreads
    // call.) In this case the cooperative loading needs
    // to initialize all the shared-memory workspaces, and needs to know
    // how many such workspaces there are, and which global workspaces
    // to draw on. NB! To use cooperative loading with a
    // single function, just use numOthers = 0.
    components.push_back(model);
    components.push_back(resolution);

    // Indices stores a (function index, parameter index) doublet for the model and resolution functions.
    std::vector<unsigned int> paramIndices;
    paramIndices.push_back(model->getFunctionIndex());
    paramIndices.push_back(model->getParameterIndex());
    paramIndices.push_back(resolution->getFunctionIndex());
    paramIndices.push_back(resolution->getParameterIndex());
    paramIndices.push_back(registerConstants(3));
    paramIndices.push_back(workSpaceIndex = totalConvolutions++);
    paramIndices.push_back(numOthers);
    if(0 == numOthers)
        paramIndices.push_back(workSpaceIndex);
    else {
        properlyInitialised = false;

        for(unsigned int i = 0; i < numOthers + 1; ++i) { // Notice the extra slot for this PDF's own index.
            // Filled in later by registerOthers - must happen before the setData call.
            paramIndices.push_back(0);
        }
    }
    if(numOthers > CONVOLUTION_CACHE_SIZE)
        throw GooFit::GeneralError(
            "numOthers {} must not be more than the cache size {}", numOthers, CONVOLUTION_CACHE_SIZE);
    GET_FUNCTION_ADDR(ptr_to_ConvolveSharedPdfs);
    initialize(paramIndices);
    setIntegrationConstants(-10, 10, 0.01);
}
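// Hypothetical usage sketch for the cooperative variant (names made up for
// illustration): three convolutions behind a MappedPdf share one block's cache.
// Each one must be told about all the others, including itself, before setData.
//
//   ConvolutionPdf convA("convA", x, mA, rA, 2);
//   ConvolutionPdf convB("convB", x, mB, rB, 2);
//   ConvolutionPdf convC("convC", x, mC, rC, 2);
//   std::vector<ConvolutionPdf *> all{&convA, &convB, &convC};
//   convA.registerOthers(all);
//   convB.registerOthers(all);
//   convC.registerOthers(all);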
__host__ void ConvolutionPdf::setIntegrationConstants(fptype lo, fptype hi, fptype step) {
    if(!host_iConsts) {
        host_iConsts = new fptype[6];
        gooMalloc(reinterpret_cast<void **>(&dev_iConsts), 6 * sizeof(fptype));
    }
    host_iConsts[0] = lo;
    host_iConsts[1] = hi;
    host_iConsts[2] = step;
    MEMCPY_TO_SYMBOL(
        functorConstants, host_iConsts, 3 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);
    if(modelWorkSpace) {
        delete modelWorkSpace;
        delete resolWorkSpace;
    }
    auto numbins = static_cast<int>(floor((host_iConsts[1] - host_iConsts[0]) / step + 0.5));
    // Different format for integration range!
    modelWorkSpace = new thrust::device_vector<fptype>(numbins);
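    // "Different format" here means dev_iConsts is filled as (lo, hi, numbins)
    // rather than (lo, hi, step): the binned-evaluation path reconstructs bin
    // centres from the bin count, not from the step size.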
    // We will do the integral from x1 to x2 of M(x) * R(x - x0) dx.
    // So we need to cache the values of M from x1 to x2, which is given
    // by the integration range. But R must be cached from x1 - maxX to
    // x2 - minX, where the min and max are given by the dependent variable.
    // However, the step must be the same as for the model, or the binning
    // will get out of sync.
    Observable dependent = *(observables.begin());
    host_iConsts[2] = numbins;
    host_iConsts[3] = (host_iConsts[0] - dependent.getUpperLimit());
    host_iConsts[4] = (host_iConsts[1] - dependent.getLowerLimit());

    numbins = static_cast<int>(floor((host_iConsts[4] - host_iConsts[3]) / step + 0.5));
    host_iConsts[5] = numbins;
    MEMCPY(dev_iConsts, host_iConsts, 6 * sizeof(fptype), cudaMemcpyHostToDevice);
    resolWorkSpace = new thrust::device_vector<fptype>(numbins);

    int offset = dependent.getUpperLimit() / step;
    MEMCPY_TO_SYMBOL(modelOffset, &offset, sizeof(int), workSpaceIndex * sizeof(int), cudaMemcpyHostToDevice);
    // Publish the device addresses of the two workspaces in constant memory,
    // slot workSpaceIndex, so the device functions can find them.
    fptype *dev_address[1];
    dev_address[0] = (&((*modelWorkSpace)[0])).get();
    MEMCPY_TO_SYMBOL(
        dev_modWorkSpace, dev_address, sizeof(fptype *), workSpaceIndex * sizeof(fptype *), cudaMemcpyHostToDevice);
    dev_address[0] = (&((*resolWorkSpace)[0])).get();
    MEMCPY_TO_SYMBOL(
        dev_resWorkSpace, dev_address, sizeof(fptype *), workSpaceIndex * sizeof(fptype *), cudaMemcpyHostToDevice);
}
__host__ void ConvolutionPdf::registerOthers(std::vector<ConvolutionPdf *> others) {
    unsigned int numExpectedOthers = host_indices[parameters + 7] + 1;

    if(numExpectedOthers != others.size()) {
        std::cout << "Problem: " << getName() << " initialized with " << others.size() << " other PDFs, expected "
                  << numExpectedOthers << " (including itself). Returning without initialization.\n";
        return;
    }
    bool foundSelf = false;

    for(unsigned int i = 0; i < others.size(); ++i) {
        ConvolutionPdf *curr = others[i];

        if(curr == this)
            foundSelf = true;

        host_indices[parameters + 8 + i] = curr->workSpaceIndex;
    }
    if(!foundSelf) {
        std::cout << "Problem: " << getName()
                  << " initialized with a list that did not include itself. Returning without initialisation.\n";
        return;
    }

    properlyInitialised = true;
}
__host__ fptype ConvolutionPdf::normalize() const {
    // if(cpuDebug & 1) std::cout << getName() << " entering normalisation\n";

    // First set normalisation factors to one so we can evaluate convolution without getting zeroes
    recursiveSetNormalisation(fptype(1.0));
    MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
    // Next recalculate functions at each point, in preparation for convolution integral
    thrust::constant_iterator<fptype *> arrayAddress(dev_iConsts);
    thrust::constant_iterator<int> eventSize(1);
    thrust::counting_iterator<int> binIndex(0);
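    // Each zip-iterator tuple (binIndex, eventSize, arrayAddress) presents one bin
    // to MetricTaker as if it were an event: the device code reads (lo, hi, numbins)
    // from dev_iConsts and reconstructs the bin centre for binIndex, so the transforms
    // below tabulate the PDFs across the whole integration grid.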
    if(model->parametersChanged()) {
        // Calculate model function at every point in integration space
        MetricTaker modalor(model, getMetricPointer("ptr_to_Eval"));
        thrust::transform(
            thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
            thrust::make_zip_iterator(thrust::make_tuple(binIndex + modelWorkSpace->size(), eventSize, arrayAddress)),
            modelWorkSpace->begin(),
            modalor);
        cudaDeviceSynchronize();
        /*
        if((cpuDebug & 1) && (5 == workSpaceIndex)) {
            thrust::host_vector<fptype> hModel(*modelWorkSpace);
            std::cout << "Model: ";
            for(unsigned int i = 0; i < hModel.size(); ++i)
                std::cout << hModel[i] << " ";
            std::cout << std::endl;
        }
        */
    }
    if(resolution->parametersChanged()) {
        // Same for resolution function.
        thrust::constant_iterator<fptype *> arrayAddress2(dev_iConsts + 3);
        MetricTaker resalor(resolution, getMetricPointer("ptr_to_Eval"));
        thrust::transform(
            thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress2)),
            thrust::make_zip_iterator(thrust::make_tuple(binIndex + resolWorkSpace->size(), eventSize, arrayAddress2)),
            resolWorkSpace->begin(),
            resalor);
    }
    // cudaDeviceSynchronize();

    // Then return the usual integral.
    fptype ret = GooPdf::normalize();
    return ret;
}

} // namespace GooFit