GooFit  v2.1.3
PdfBase.cu
Go to the documentation of this file.
#include <goofit/BinnedDataSet.h>
#include <goofit/Error.h>
#include <goofit/FitControl.h>
#include <goofit/GlobalCudaDefines.h>
#include <goofit/Log.h>
#include <goofit/PDFs/GooPdf.h>
#include <goofit/PdfBase.h>

#include <cmath>
#include <iostream>
#include <memory>
#include <vector>
8 
9 #ifdef GOOFIT_MPI
10 #include <mpi.h>
11 #endif
12 
13 namespace GooFit {
14 
15 // This is code that belongs to the PdfBase class, that is,
16 // it is common across all implementations. But it calls on device-side
17 // functions, and due to the nvcc translation-unit limitations, it cannot
18 // sit in its own object file; it must go in the CUDAglob.cu. So it's
19 // off on its own in this inline-cuda file, which GooPdf.cu
20 // should include.
21 
__host__ void PdfBase::copyParams(const std::vector<double> &pars) const {
    // Copies the raw parameter values into the device-side cudaArray symbol,
    // staging them through the host_params buffer.

    // host_params is a fixed-size staging array; refuse to overflow it.
    if(pars.size() > maxParams)
        throw GooFit::GeneralError("Copying {} parameters, but GooFit array is only {}", pars.size(), maxParams);

    for(unsigned int i = 0; i < pars.size(); ++i) {
        host_params[i] = pars[i];

        // A NaN parameter would poison every PDF evaluation; fail loudly instead.
        if(std::isnan(host_params[i])) {
            std::cout << " agh, parameter is NaN, die " << i << std::endl;
            GooFit::abort(__FILE__, __LINE__, "NaN in parameter");
        }
    }

    MEMCPY_TO_SYMBOL(cudaArray, host_params, pars.size() * sizeof(fptype), 0, cudaMemcpyHostToDevice);
}
36 
__host__ void PdfBase::copyParams() {
    // Gather the current value of every registered Variable, placing each at
    // the slot given by its parameter index, then push the set to the device.
    std::vector<Variable> params = getParameters();
    std::vector<double> values;

    for(const Variable &param : params) {
        int slot = param.getIndex();

        // Grow the buffer on demand; indices need not arrive in order.
        if(static_cast<int>(values.size()) <= slot)
            values.resize(slot + 1);

        values[slot] = param.getValue();
    }

    copyParams(values);
}
53 
__host__ void PdfBase::copyNormFactors() const {
    // Push the host-side normalisation values to the device-side
    // normalisationFactors symbol so kernels see up-to-date factors.
    MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize(); // Ensure normalisation integrals are finished
}
58 
__host__ void PdfBase::initializeIndices(std::vector<unsigned int> pindices) {
    // Structure of the individual index array: number of parameters, then the indices
    // requested by the subclass (which will be interpreted by the subclass kernel),
    // then the number of observables, then the observable indices. Notice that the
    // observable indices are not set until 'setIndices' is called, usually from setData;
    // here we only reserve space for them by advancing totalParams.
    // This is to allow index sharing between PDFs - all the PDFs must be constructed
    // before we know what observables exist.

    GOOFIT_DEBUG("Adding space for {} indices for {}", pindices.size(), getName());

    // This function writes slots totalParams .. totalParams + pindices.size() + 1
    // of host_indices, so the highest written slot must be below maxParams.
    // (The previous check omitted the final '+ 1' observable-count slot,
    // allowing a one-element overflow at the boundary.)
    if(totalParams + pindices.size() + 1 >= maxParams)
        throw GooFit::GeneralError(
            "totalParams {} + pindices {} must be less than {}", totalParams, pindices.size(), maxParams);

    host_indices[totalParams] = pindices.size();

    for(int i = 1; i <= host_indices[totalParams]; ++i) {
        GOOFIT_DEBUG("Setting host index {} to {}", totalParams + i, i - 1);
        host_indices[totalParams + i] = pindices[i - 1];
    }

    GOOFIT_DEBUG(
        "Setting host index {} to the num of observables, {}", totalParams + pindices.size() + 1, observables.size());
    host_indices[totalParams + pindices.size() + 1] = observables.size();

    // Remember where this PDF's block begins, then advance past
    // parameter count + parameter indices + observable count + observables.
    parameters = totalParams;
    totalParams += (2 + pindices.size() + observables.size());
    GOOFIT_DEBUG("New total parameters: {}", totalParams);

    if(totalParams >= maxParams)
        throw GooFit::GeneralError("{}: Set too many parameters, GooFit array more than {}. Increase max at compile "
                                   "time with -DGOOFIT_MAXPAR=N.",
                                   getName(),
                                   maxParams);

    MEMCPY_TO_SYMBOL(paramIndices, host_indices, totalParams * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
}
96 
__host__ void PdfBase::recursiveSetIndices() {
    // Children first, so their index blocks are filled before this PDF's.
    for(auto &component : components)
        component->recursiveSetIndices();

    // Observable indices live after the parameter-count slot, the parameter
    // indices themselves, and the observable-count slot of this PDF's block.
    int numParams = host_indices[parameters];
    int offset    = parameters + 2 + numParams;
    int slot      = 0;

    for(const Observable &obs : observables) {
        host_indices[offset + slot] = obs.getIndex();
        GOOFIT_DEBUG("{} set index of {} to {} -> host {}",
                     getName(),
                     obs.getName(),
                     obs.getIndex(),
                     offset + slot)
        ++slot;
    }

    generateNormRange();
}
117 
__host__ void PdfBase::setIndices() {
    // Number the observables consecutively in registration order...
    int next = 0;

    for(Observable &obs : observables)
        obs.setIndex(next++);

    // ...then let every PDF in the tree record those indices into
    // host_indices, and push the finished table to the device.
    recursiveSetIndices();
    MEMCPY_TO_SYMBOL(paramIndices, host_indices, totalParams * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
}
134 
135 __host__ DataSet *PdfBase::getData() { return data_; }
136 
__host__ void PdfBase::setData(DataSet *data) {
    // Attaches a dataset to this PDF: assigns observable indices, flattens the
    // data into a host buffer (event-major, observable-minor), and uploads it
    // to dev_event_array. Under MPI, each rank uploads only its slice.

    // Release any previously uploaded events before attaching the new set.
    if(dev_event_array) {
        gooFree(dev_event_array);
        cudaDeviceSynchronize();
        dev_event_array  = nullptr;
        m_iEventsPerTask = 0;
    }

    data_ = data;

    // Do nothing if passed a nullptr (makes setData(getData()) safe)
    if(data == nullptr)
        return;

    setIndices();

    UnbinnedDataSet *unbinned_data;
    BinnedDataSet *binned_data;

    if((unbinned_data = dynamic_cast<UnbinnedDataSet *>(data))) {
        numEntries = data->getNumEvents();
        numEvents  = numEntries;

        size_t dimensions = observables.size();

#ifdef GOOFIT_MPI
        // This fetches our rank and the total number of processes in the MPI call
        int myId, numProcs;
        MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myId);

        int perTask = numEvents / numProcs;

        // Per-rank event counts and start offsets; the last rank absorbs the
        // remainder. std::vector replaces the original raw new[]/delete[].
        std::vector<int> counts(numProcs);
        std::vector<int> displacements(numProcs);

        for(int i = 0; i < numProcs - 1; i++)
            counts[i] = perTask;

        counts[numProcs - 1] = numEntries - perTask * (numProcs - 1);

        displacements[0] = 0;

        for(int i = 1; i < numProcs; i++)
            displacements[i] = displacements[i - 1] + counts[i - 1];

#endif

        // Flat host staging buffer (std::vector: exception-safe, no manual delete[]).
        std::vector<fptype> host_array(numEntries * dimensions);

#ifdef GOOFIT_MPI
        // Track which observables are CountingVariables and so need per-rank
        // re-indexing (vector replaces the non-standard VLA + memset).
        std::vector<int> fixme(observables.size(), 0);

        for(int i = 0; i < observables.size(); i++) {
            // We are casting the observable to a CountVariable
            CountingVariable *c = dynamic_cast<CountingVariable *>(observables[i]);

            // if it is true re-index
            if(c)
                fixme[i] = 1;
        }

#endif

        // Transfer into our whole buffer
        for(int i = 0; i < numEntries; ++i) {
            for(const Observable &v : observables) {
                fptype currVal                            = unbinned_data->getValue(v, i);
                host_array[i * dimensions + v.getIndex()] = currVal;
            }
        }

#ifdef GOOFIT_MPI

        // We will go through all of the events and re-index if appropriate
        for(int i = 1; i < numProcs; i++) {
            for(int j = 0; j < counts[i]; j++) {
                for(int k = 0; k < dimensions; k++) {
                    if(fixme[k] > 0)
                        host_array[(j + displacements[i]) * dimensions + k] = float(j);
                }
            }
        }

        int mystart = displacements[myId];
        int myend   = mystart + counts[myId];
        int mycount = myend - mystart;

        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * mycount * sizeof(fptype));
        MEMCPY(dev_event_array,
               host_array.data() + mystart * dimensions,
               dimensions * mycount * sizeof(fptype),
               cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

        setNumPerTask(this, mycount);
#else
        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * numEntries * sizeof(fptype));
        MEMCPY(dev_event_array, host_array.data(), dimensions * numEntries * sizeof(fptype), cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);
#endif
    } else if((binned_data = dynamic_cast<BinnedDataSet *>(data))) {
        numEvents      = 0;
        numEntries     = binned_data->getNumBins();
        int dimensions = 2 + observables.size(); // Bin center (x,y, ...), bin value, and bin volume.

        if(!fitControl->binnedFit())
            setFitControl(std::make_shared<BinnedNllFit>());

#ifdef GOOFIT_MPI
        // This fetches our rank and the total number of processes in the MPI call
        int myId, numProcs;
        MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myId);

        // BUGFIX: split bins (numEntries), not events. numEvents was just reset
        // to 0, so the old numEvents-based split gave every rank except the
        // last zero work.
        int perTask = numEntries / numProcs;

        // This will track for a given rank where they will start and how far they will go
        std::vector<int> counts(numProcs);
        std::vector<int> displacements(numProcs);

        for(int i = 0; i < numProcs - 1; i++)
            counts[i] = perTask;

        counts[numProcs - 1] = numEntries - perTask * (numProcs - 1);

        displacements[0] = 0;

        for(int i = 1; i < numProcs; i++)
            displacements[i] = displacements[i - 1] + counts[i - 1];

#endif

        // Flat host staging buffer: per bin, the centers, then content, then error/volume.
        std::vector<fptype> host_array(numEntries * dimensions);

#ifdef GOOFIT_MPI
        // This is an array to track if we need to re-index the observable
        std::vector<int> fixme(observables.size(), 0);

        for(int i = 0; i < observables.size(); i++) {
            // We are casting the observable to a CountVariable
            CountingVariable *c = dynamic_cast<CountingVariable *>(observables[i]);

            // if it is true re-index
            if(c)
                fixme[i] = 1;
        }

#endif

        for(unsigned int i = 0; i < numEntries; ++i) {
            for(const Observable &v : observables) {
                host_array[i * dimensions + v.getIndex()] = binned_data->getBinCenter(v, i);
            }

            host_array[i * dimensions + observables.size() + 0] = binned_data->getBinContent(i);
            // Error-based fits store the bin error; otherwise the bin volume.
            host_array[i * dimensions + observables.size() + 1]
                = fitControl->binErrors() ? binned_data->getBinError(i) : binned_data->getBinVolume(i);
            numEvents += binned_data->getBinContent(i);
        }

#ifdef GOOFIT_MPI

        // We will go through all of the events and re-index if appropriate.
        // BUGFIX: inner loop bound was counts[j]; it must be counts[i] to
        // match the unbinned branch and iterate over rank i's slice.
        for(int i = 1; i < numProcs; i++) {
            for(int j = 0; j < counts[i]; j++) {
                for(int k = 0; k < dimensions; k++) {
                    if(fixme[k] > 0)
                        host_array[(j + displacements[i]) * dimensions + k] = float(j);
                }
            }
        }

        int mystart = displacements[myId];
        int myend   = mystart + counts[myId];
        int mycount = myend - mystart;

        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * mycount * sizeof(fptype));
        MEMCPY(dev_event_array,
               host_array.data() + mystart * dimensions,
               dimensions * mycount * sizeof(fptype),
               cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

        setNumPerTask(this, mycount);
#else
        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * numEntries * sizeof(fptype));
        MEMCPY(dev_event_array, host_array.data(), dimensions * numEntries * sizeof(fptype), cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);
#endif
    } else
        throw GooFit::GeneralError("Dataset must be binned or unbinned!");
}
344 
__host__ void PdfBase::generateNormRange() {
    // (Re)builds the device-side normalisation table: for each observable,
    // three values - lower limit, upper limit, and number of integration bins.
    if(normRanges)
        gooFree(normRanges);

    gooMalloc(reinterpret_cast<void **>(&normRanges), 3 * observables.size() * sizeof(fptype));

    // Use a running counter, not the observable index, so the table stays
    // dense even when indices are sparse (e.g. a single observable whose
    // index is 1, or two observables with indices 0 and 2). One array per
    // functor, as opposed to per variable, makes it easy to pass MetricTaker
    // a range without worrying about which parts to use.
    // std::vector replaces the raw new[]/delete[] pair (leak-safe on throw).
    std::vector<fptype> host_norms(3 * observables.size());
    int counter = 0;

    for(Observable &v : observables) {
        host_norms[3 * counter + 0] = v.getLowerLimit();
        host_norms[3 * counter + 1] = v.getUpperLimit();
        // A positive global integrationBins overrides the per-observable binning.
        host_norms[3 * counter + 2] = integrationBins > 0 ? integrationBins : v.getNumBins();
        counter++;
    }

    MEMCPY(normRanges, host_norms.data(), 3 * observables.size() * sizeof(fptype), cudaMemcpyHostToDevice);
}
368 
// Resets the global index bookkeeping and frees the uploaded event buffer,
// so a fresh fit can be configured from scratch.
void PdfBase::clearCurrentFit() {
    totalParams = 0;
    gooFree(dev_event_array);
    dev_event_array = nullptr;
}
374 
__host__ void PdfBase::printProfileInfo(bool topLevel) {
#ifdef PROFILING
    // Only meaningful when compiled with PROFILING; otherwise a no-op.
    // The top-level call copies the device-side timing histogram once; the
    // recursive calls (topLevel == false) reuse the copied host buffer.
    if(topLevel) {
        cudaError_t err = MEMCPY_FROM_SYMBOL(host_timeHist, timeHistogram, 10000 * sizeof(fptype), 0);

        if(cudaSuccess != err) {
            std::cout << "Error on copying timeHistogram: " << cudaGetErrorString(err) << std::endl;
            return;
        }
    }

    // Print this PDF's accumulated time, then recurse so every component in
    // the tree reports. (Previously the print and the recursion were inside
    // the if(topLevel) block, so child PDFs never printed anything.)
    std::cout << getName() << " : " << getFunctionIndex() << " "
              << host_timeHist[100 * getFunctionIndex() + getParameterIndex()] << std::endl;

    for(unsigned int i = 0; i < components.size(); ++i) {
        components[i]->printProfileInfo(false);
    }

#endif
}
396 
cudaError_t gooMalloc(void **target, size_t bytes) {
#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA
    // Host-only backends (OMP/TBB/CPP): fall back to plain malloc and
    // translate the outcome into a CUDA-style status code.
    void *mem = malloc(bytes);
    *target   = mem;
    return mem != nullptr ? cudaSuccess : cudaErrorMemoryAllocation;
#else
    return cudaMalloc(target, bytes);
#endif
}
410 
cudaError_t gooFree(void *ptr) {
#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA
    // Host-only backends: release with free; there is no failure to report.
    free(ptr);
    return cudaSuccess;
#else
    return cudaFree(ptr);
#endif
}
419 } // namespace GooFit