GooFit v2.1.3
ConvolutionPdf.cu
#include <goofit/Error.h>
#include <goofit/PDFs/combine/ConvolutionPdf.h>
#include <goofit/Variable.h>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>

namespace GooFit {

int totalConvolutions = 0;

#define CONVOLUTION_CACHE_SIZE 512
// I would really like this to be 1024, which is the maximum number of threads
// in a block (for compute capability 2.0 and up). Unfortunately this causes
// the program to hang, presumably because there isn't enough memory and something
// goes wrong. So... 512 should be enough for anyone, right?

// Need multiple working spaces for the case of several convolutions in one PDF.
__constant__ fptype *dev_modWorkSpace[100];
__constant__ fptype *dev_resWorkSpace[100];

// Number which transforms the model range (x1, x2) into the resolution range (x1 - maxX, x2 - minX).
// It is equal to the maximum possible value of x0, i.e. maxX, in bins.
__constant__ int modelOffset[100];

__device__ fptype device_ConvolvePdfs(fptype *evt, fptype *p, unsigned int *indices) {
    fptype ret         = 0;
    fptype loBound     = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 0]);
    fptype hiBound     = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 1]);
    fptype step        = RO_CACHE(functorConstants[RO_CACHE(indices[5]) + 2]);
    fptype x0          = evt[indices[2 + indices[0]]];
    int workSpaceIndex = indices[6];

    auto numbins = static_cast<int>(floor((hiBound - loBound) / step + 0.5));

    fptype lowerBoundOffset = loBound / step;
    lowerBoundOffset -= floor(lowerBoundOffset);
    auto offsetInBins = static_cast<int>(floor(x0 / step - lowerBoundOffset));

    // Brute-force calculate the integral M(x) * R(x - x0) dx
    int offset = RO_CACHE(modelOffset[workSpaceIndex]);

    for(int i = 0; i < numbins; ++i) {
        fptype model = RO_CACHE(dev_modWorkSpace[workSpaceIndex][i]);
        fptype resol = RO_CACHE(dev_resWorkSpace[workSpaceIndex][i + offset - offsetInBins]);

        ret += model * resol;
    }

    ret *= normalisationFactors[RO_CACHE(indices[2])];
    ret *= normalisationFactors[RO_CACHE(indices[4])];

    return ret;
}
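
// The loop in device_ConvolvePdfs above is a Riemann-sum approximation of the
// convolution integral at the event value x0,
//
//     P(x0) = integral of M(x) * R(x - x0) dx  ~  sum_i M(x_i) * R(x_i - x0),
//
// with x_i = loBound + i * step. The constant factor of `step` is left out; it
// cancels when GooPdf::normalize divides by the numerically integrated total.
// Because the resolution workspace is tabulated on the shifted range
// (loBound - maxX, hiBound - minX), model bin i is paired with resolution bin
// i + modelOffset - offsetInBins. As a worked example (the observable limits
// here are illustrative, not taken from this file): with the default
// integration constants (-10, 10, 0.01) and an observable on (-5, 5), the
// model is cached in 2000 bins, the resolution in 3000 bins over (-15, 15),
// and modelOffset = 5 / 0.01 = 500 bins.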

__device__ fptype device_ConvolveSharedPdfs(fptype *evt, fptype *p, unsigned int *indices) {
    fptype ret                  = 0;
    fptype loBound              = functorConstants[indices[5] + 0];
    fptype hiBound              = functorConstants[indices[5] + 1];
    fptype step                 = functorConstants[indices[5] + 2];
    fptype x0                   = evt[indices[2 + indices[0]]];
    unsigned int workSpaceIndex = indices[6];
    unsigned int numOthers      = indices[7] + 1; // +1 for this PDF.

    auto numbins = static_cast<int>(floor((hiBound - loBound) / step + 0.5));

    fptype lowerBoundOffset = loBound / step;
    lowerBoundOffset -= floor(lowerBoundOffset);
    auto offsetInBins = static_cast<int>(floor(x0 / step - lowerBoundOffset));

    // Brute-force calculate the integral M(x) * R(x - x0) dx
    __shared__ fptype modelCache[CONVOLUTION_CACHE_SIZE];
    // Don't try to shared-load more items than we have threads.
    int numToLoad
        = thrust::minimum<unsigned int>()(CONVOLUTION_CACHE_SIZE / numOthers, static_cast<unsigned int>(BLOCKDIM));

    for(int i = 0; i < numbins; i += numToLoad) {
        // This code avoids this problem: If event 0 is in workspace 0, and
        // event 1 is in workspace 1, then threads 0 and 1 cannot fill the
        // same modelCache, because they are drawing from different models!
        // What's more, it's not guaranteed that the whole modelCache will
        // be filled, because the threads can be all over the place in terms
        // of their workspace.

        if(THREADIDX < numToLoad) {
            for(unsigned int w = 0; w < numOthers; ++w) {
                unsigned int wIndex = indices[8 + w];
                modelCache[w * numToLoad + THREADIDX]
                    = (i + THREADIDX < numbins) ? dev_modWorkSpace[wIndex][i + THREADIDX] : 0;
            }
        }

        // This is slightly dangerous. If you have, for example,
        // a MappedPdf in which one of the other functions
        // is not a convolution, *you're gonna have a bad time*.
        // It's unfortunately up to the user to ensure that every
        // event will hit this point, by making sure that if *any*
        // event has a convolution, *all* events do.
        THREAD_SYNCH

        for(int j = 0; j < numToLoad; ++j) {
            if(i + j >= numbins)
                break;

            fptype model = modelCache[workSpaceIndex * numToLoad + j];
            fptype resol = (model != 0)
                               ? dev_resWorkSpace[workSpaceIndex][i + j + modelOffset[workSpaceIndex] - offsetInBins]
                               : 0;
            ret += model * resol;
        }

        // If we don't sync at this point, some of the warps can *run ahead*
        // and fill up their parts of modelCache with the *next* set of numbers,
        // which means that the warps still working on the sum get the wrong
        // numbers!
        THREAD_SYNCH
    }

    ret *= normalisationFactors[indices[2]];
    ret *= normalisationFactors[indices[4]];

    return ret;
}
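
// A note on the shared-memory layout used by device_ConvolveSharedPdfs: each
// pass of the outer loop cooperatively loads
// numToLoad = min(CONVOLUTION_CACHE_SIZE / numOthers, BLOCKDIM) model bins for
// every one of the numOthers workspaces, so modelCache[w * numToLoad + j]
// holds model bin (i + j) of the workspace whose global index is
// indices[8 + w]. Each thread then reads only the slice belonging to its own
// workSpaceIndex, which is why all workspaces sharing a block must be loaded
// on every pass even when only some threads use them.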

__device__ device_function_ptr ptr_to_ConvolvePdfs       = device_ConvolvePdfs;
__device__ device_function_ptr ptr_to_ConvolveSharedPdfs = device_ConvolveSharedPdfs;

ConvolutionPdf::ConvolutionPdf(std::string n, Observable x, GooPdf *m, GooPdf *r)
    : GooPdf(n, x)
    , model(m)
    , resolution(r)
    , host_iConsts(nullptr)
    , modelWorkSpace(nullptr)
    , resolWorkSpace(nullptr)
    , workSpaceIndex(0) {
    // Constructor for convolution without cooperative
    // loading of the model cache. This is slow, but conceptually
    // simple.
    components.push_back(model);
    components.push_back(resolution);

    // Indices stores the (function index)(parameter index) doublet for the model and resolution functions.
    std::vector<unsigned int> paramIndices;
    paramIndices.push_back(model->getFunctionIndex());
    paramIndices.push_back(model->getParameterIndex());
    paramIndices.push_back(resolution->getFunctionIndex());
    paramIndices.push_back(resolution->getParameterIndex());
    paramIndices.push_back(registerConstants(3));
    paramIndices.push_back(workSpaceIndex = totalConvolutions++);

    GET_FUNCTION_ADDR(ptr_to_ConvolvePdfs);
    initialize(paramIndices);
    setIntegrationConstants(-10, 10, 0.01);
}
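
// A minimal usage sketch for this simple (non-cooperative) constructor; the
// observable range and component PDFs below are hypothetical, not taken from
// this file:
//
//     Observable x("x", -5, 5);
//     // signalModel and resolutionModel are GooPdf instances built elsewhere
//     ConvolutionPdf conv("conv", x, &signalModel, &resolutionModel);
//     conv.setIntegrationConstants(-5, 5, 0.01); // override the (-10, 10, 0.01) default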

ConvolutionPdf::ConvolutionPdf(std::string n, Observable x, GooPdf *m, GooPdf *r, unsigned int numOthers)
    : GooPdf(n, x)
    , model(m)
    , resolution(r)
    , host_iConsts(nullptr)
    , modelWorkSpace(nullptr)
    , resolWorkSpace(nullptr)
    , workSpaceIndex(0) {
    // Constructor for the case when several convolutions
    // can run in parallel; for example, a map where the targets
    // are convolutions. (Make sure that *all* the targets
    // are convolutions! Otherwise it will hang on the syncthreads
    // call.) In this case the cooperative loading needs
    // to initialize all the shared memory workspaces, and needs to know
    // how many such workspaces there are, and which global workspaces
    // to draw on. NB! To use cooperative loading in the case of a
    // single function, just use numOthers = 0.

    components.push_back(model);
    components.push_back(resolution);

    // Indices stores the (function index)(parameter index) doublet for the model and resolution functions.
    std::vector<unsigned int> paramIndices;
    paramIndices.push_back(model->getFunctionIndex());
    paramIndices.push_back(model->getParameterIndex());
    paramIndices.push_back(resolution->getFunctionIndex());
    paramIndices.push_back(resolution->getParameterIndex());
    paramIndices.push_back(registerConstants(3));
    paramIndices.push_back(workSpaceIndex = totalConvolutions++);
    paramIndices.push_back(numOthers);

    if(0 == numOthers)
        paramIndices.push_back(workSpaceIndex);
    else {
        properlyInitialised = false;

        for(unsigned int i = 0; i < numOthers + 1; ++i) { // Notice extra space for this PDF's index.
            // Fill in later - must be done before the setData call.
            paramIndices.push_back(0);
        }
    }

    if(numOthers > CONVOLUTION_CACHE_SIZE)
        throw GooFit::GeneralError(
            "numOthers {} must not be more than the cache size {}", numOthers, CONVOLUTION_CACHE_SIZE);

    GET_FUNCTION_ADDR(ptr_to_ConvolveSharedPdfs);
    initialize(paramIndices);
    setIntegrationConstants(-10, 10, 0.01);
}
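
// A sketch of the cooperative-loading setup with two convolutions that end up
// in the same kernel (all names hypothetical). Each PDF is constructed with
// numOthers = 1, the count of *other* convolutions, and must later be given
// the full list, including itself, via registerOthers before setData is
// called:
//
//     ConvolutionPdf convA("convA", x, &modelA, &resol, 1);
//     ConvolutionPdf convB("convB", x, &modelB, &resol, 1);
//     std::vector<ConvolutionPdf *> all = {&convA, &convB};
//     convA.registerOthers(all);
//     convB.registerOthers(all);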

__host__ void ConvolutionPdf::setIntegrationConstants(fptype lo, fptype hi, fptype step) {
    if(!host_iConsts) {
        host_iConsts = new fptype[6];
        gooMalloc(reinterpret_cast<void **>(&dev_iConsts), 6 * sizeof(fptype));
    }

    host_iConsts[0] = lo;
    host_iConsts[1] = hi;
    host_iConsts[2] = step;
    MEMCPY_TO_SYMBOL(
        functorConstants, host_iConsts, 3 * sizeof(fptype), cIndex * sizeof(fptype), cudaMemcpyHostToDevice);

    if(modelWorkSpace) {
        delete modelWorkSpace;
        delete resolWorkSpace;
    }

    auto numbins = static_cast<int>(floor((host_iConsts[1] - host_iConsts[0]) / step + 0.5));
    // Different format for the integration range!
    modelWorkSpace = new thrust::device_vector<fptype>(numbins);

    // We will do the integral from x1 to x2 of M(x)*R(x - x0) dx.
    // So we need to cache the values of M from x1 to x2, which is given
    // by the integration range. But R must be cached from x1-maxX to
    // x2-minX, and the min and max are given by the dependent variable.
    // However, the step must be the same as for the model, or the binning
    // will get out of sync.
    Observable dependent = *(observables.begin());

    host_iConsts[2] = numbins;
    host_iConsts[3] = (host_iConsts[0] - dependent.getUpperLimit());
    host_iConsts[4] = (host_iConsts[1] - dependent.getLowerLimit());

    numbins         = static_cast<int>(floor((host_iConsts[4] - host_iConsts[3]) / step + 0.5));
    host_iConsts[5] = numbins;
    MEMCPY(dev_iConsts, host_iConsts, 6 * sizeof(fptype), cudaMemcpyHostToDevice);
    resolWorkSpace = new thrust::device_vector<fptype>(numbins);

    int offset = dependent.getUpperLimit() / step;
    MEMCPY_TO_SYMBOL(modelOffset, &offset, sizeof(int), workSpaceIndex * sizeof(int), cudaMemcpyHostToDevice);

    fptype *dev_address[1];
    dev_address[0] = (&((*modelWorkSpace)[0])).get();
    MEMCPY_TO_SYMBOL(
        dev_modWorkSpace, dev_address, sizeof(fptype *), workSpaceIndex * sizeof(fptype *), cudaMemcpyHostToDevice);
    dev_address[0] = (&((*resolWorkSpace)[0])).get();
    MEMCPY_TO_SYMBOL(
        dev_resWorkSpace, dev_address, sizeof(fptype *), workSpaceIndex * sizeof(fptype *), cudaMemcpyHostToDevice);
}
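
// Layout of dev_iConsts after the copies above: entries 0-2 describe the model
// grid as {lo, hi, numbins}, and entries 3-5 describe the resolution grid as
// {lo - xMax, hi - xMin, numbins}, where xMin and xMax are the limits of the
// dependent observable. normalize() below hands dev_iConsts and dev_iConsts + 3
// to the MetricTaker evaluations as the integration ranges used to fill
// modelWorkSpace and resolWorkSpace.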

__host__ void ConvolutionPdf::registerOthers(std::vector<ConvolutionPdf *> others) {
    unsigned int numExpectedOthers = host_indices[parameters + 7] + 1;

    if(numExpectedOthers != others.size()) {
        std::cout << "Problem: " << getName() << " initialized with " << others.size() << " other PDFs, expected "
                  << numExpectedOthers << " (including itself). Returning without initialization.\n";
        return;
    }

    bool foundSelf = false;

    for(unsigned int i = 0; i < others.size(); ++i) {
        ConvolutionPdf *curr = others[i];

        if(curr == this)
            foundSelf = true;

        host_indices[parameters + 8 + i] = curr->workSpaceIndex;
    }

    if(!foundSelf) {
        std::cout << "Problem: " << getName()
                  << " initialized with a list that did not include itself. Returning without initialization.\n";
        return;
    }

    properlyInitialised = true;
}

__host__ fptype ConvolutionPdf::normalize() const {
    // if (cpuDebug & 1) std::cout << getName() << " entering normalisation\n";

    // First set normalisation factors to one so we can evaluate the convolution without getting zeroes
    recursiveSetNormalisation(fptype(1.0));
    MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);

    // Next recalculate the functions at each point, in preparation for the convolution integral
    thrust::constant_iterator<fptype *> arrayAddress(dev_iConsts);
    thrust::constant_iterator<int> eventSize(1);
    thrust::counting_iterator<int> binIndex(0);

    if(model->parametersChanged()) {
        // Calculate the model function at every point in integration space
        MetricTaker modalor(model, getMetricPointer("ptr_to_Eval"));
        thrust::transform(
            thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
            thrust::make_zip_iterator(thrust::make_tuple(binIndex + modelWorkSpace->size(), eventSize, arrayAddress)),
            modelWorkSpace->begin(),
            modalor);
        cudaDeviceSynchronize();
        /*
        if ((cpuDebug & 1) && (5 == workSpaceIndex)) {
            thrust::host_vector<fptype> hModel(*modelWorkSpace);
            std::cout << "Model: ";
            for (unsigned int i = 0; i < hModel.size(); ++i)
                std::cout << hModel[i] << " ";
            std::cout << std::endl;
        }
        */
    }

    if(resolution->parametersChanged()) {
        // Same for the resolution function.
        thrust::constant_iterator<fptype *> arrayAddress2(dev_iConsts + 3);
        MetricTaker resalor(resolution, getMetricPointer("ptr_to_Eval"));
        thrust::transform(
            thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress2)),
            thrust::make_zip_iterator(thrust::make_tuple(binIndex + resolWorkSpace->size(), eventSize, arrayAddress2)),
            resolWorkSpace->begin(),
            resalor);
    }

    // cudaDeviceSynchronize();

    // Then return the usual integral
    fptype ret = GooPdf::normalize();
    return ret;
}
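
// normalize() above proceeds in three steps: (1) reset every cached
// normalisation factor to one so the raw shapes can be evaluated, (2)
// re-tabulate the model and resolution on their integration grids (only when
// their parameters have changed since the last call), and (3) fall back to
// GooPdf::normalize for the ordinary numeric integral of the convolved PDF.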

} // namespace GooFit