1 #include <goofit/BinnedDataSet.h>
2 #include <goofit/Error.h>
3 #include <goofit/FitControl.h>
4 #include <goofit/GlobalCudaDefines.h>
5 #include <goofit/Log.h>
6 #include <goofit/PDFs/GooPdf.h>
7 #include <goofit/PdfBase.h>
15 // This is code that belongs to the PdfBase class, that is,
16 // it is common across all implementations. But it calls on device-side
17 // functions, and due to the nvcc translation-unit limitations, it cannot
18 // sit in its own object file; it must go in the CUDAglob.cu. So it's
// off on its own in this inline-cuda file, which GooPdf.cu includes.
__host__ void PdfBase::copyParams(const std::vector<double> &pars) const {
    // Copy a flat vector of parameter values into the host-side parameter
    // array, then push it to the device symbol `cudaArray`. Any NaN input
    // aborts the program, since a NaN parameter would poison every PDF
    // evaluation downstream.
    // copyParams method performs eponymous action!
    for(unsigned int i = 0; i < pars.size(); ++i) {
        host_params[i] = pars[i];

        if(std::isnan(host_params[i])) {
            std::cout << " agh, parameter is NaN, die " << i << std::endl;
            GooFit::abort(__FILE__, __LINE__, "NaN in parameter");

    // Blocking copy of all parameter values to device constant memory.
    MEMCPY_TO_SYMBOL(cudaArray, host_params, pars.size() * sizeof(fptype), 0, cudaMemcpyHostToDevice);
__host__ void PdfBase::copyParams() {
    // Copies values of Variable objects
    // Gathers the current value of every registered parameter into a flat
    // vector, ordered by each Variable's index, growing the vector on demand
    // (indices need not be contiguous or arrive in order).
    std::vector<Variable> pars = getParameters();
    std::vector<double> values;

    for(const Variable &v : pars) {
        int index = v.getIndex();

        // Indices may arrive out of order; resize so values[index] is valid.
        if(index >= static_cast<int>(values.size()))
            values.resize(index + 1);

        values[index] = v.getValue();
__host__ void PdfBase::copyNormFactors() const {
    // Push the host-side normalisation factors to the device symbol, then
    // block until the device is idle so later reads see completed values.
    MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize(); // Ensure normalisation integrals are finished
__host__ void PdfBase::initializeIndices(std::vector<unsigned int> pindices) {
    // Structure of the individual index array: Number of parameters, then the indices
    // requested by the subclass (which will be interpreted by the subclass kernel),
    // then the number of observables, then the observable indices. Notice that the
    // observable indices are not set until 'setIndices' is called, usually from setData;
    // here we only reserve space for them by setting totalParams.
    // This is to allow index sharing between PDFs - all the PDFs must be constructed
    // before we know what observables exist.

    GOOFIT_DEBUG("Adding space for {} indices for {}", pindices.size(), getName());

    // Guard before writing: the new entries must fit in the fixed-size table.
    if(totalParams + pindices.size() >= maxParams)
        throw GooFit::GeneralError(
            "totalParams {} + pindices {} must be less than {}", totalParams, pindices.size(), maxParams);
    // Slot 0 of this PDF's stripe holds the count of parameter indices.
    host_indices[totalParams] = pindices.size();

    // Copy the subclass-requested indices into slots 1..N of the stripe.
    for(int i = 1; i <= host_indices[totalParams]; ++i) {
        GOOFIT_DEBUG("Setting host index {} to {}", totalParams + i, i - 1);
        host_indices[totalParams + i] = pindices[i - 1];

        "Setting host index {} to the num of observables, {}", totalParams + pindices.size() + 1, observables.size());
    // After the parameter indices comes the observable count; the observable
    // indices themselves are filled in later by recursiveSetIndices().
    host_indices[totalParams + pindices.size() + 1] = observables.size();

    // Remember where this PDF's stripe starts, then advance the global cursor:
    // 2 header slots (param count + obs count) plus both index runs.
    parameters = totalParams;
    totalParams += (2 + pindices.size() + observables.size());
    GOOFIT_DEBUG("New total parameters: {}", totalParams);

    if(totalParams >= maxParams)
        throw GooFit::GeneralError("{}: Set too many parameters, GooFit array more than {}. Increase max at compile "
                                   "time with -DGOOFIT_MAXPAR=N.",

    // Push the updated index table to device constant memory.
    MEMCPY_TO_SYMBOL(paramIndices, host_indices, totalParams * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
__host__ void PdfBase::recursiveSetIndices() {
    // Depth-first: write component observable indices before this PDF's own,
    // so every nested PDF's stripe in host_indices is completed.
    for(auto &component : components) {
        component->recursiveSetIndices();

    // Number of parameter indices in this PDF's stripe (stored in slot 0).
    int numParams = host_indices[parameters];

    // Record each observable's global index after the parameter indices
    // (stripe layout: param count, params..., obs count, obs indices...).
    // NOTE(review): `counter` is used here but its declaration/increment is
    // not visible in this view of the file — presumably declared just above.
    for(const Observable &v : observables) {
        host_indices[parameters + 2 + numParams + counter] = v.getIndex();
        GOOFIT_DEBUG("{} set index of {} to {} -> host {}",

                     parameters + 2 + numParams + counter)
__host__ void PdfBase::setIndices() {
    // Assign a sequential global index to every observable, then recursively
    // fill the observable-index slots for this PDF and all components, and
    // finally push the complete table to the device symbol paramIndices.
    // NOTE(review): `counter` initialization is not visible in this view —
    // presumably `int counter = 0;` just above this loop.
    for(Observable &v : observables) {
        v.setIndex(counter++);

    recursiveSetIndices();
    MEMCPY_TO_SYMBOL(paramIndices, host_indices, totalParams * sizeof(unsigned int), 0, cudaMemcpyHostToDevice);

    // Debug dump of the index table, kept for future use:
    // std::cout << "host_indices after " << getName() << " observable setIndices : ";
    // for (int i = 0; i < totalParams; ++i) {
    // std::cout << host_indices[i] << " ";

    // std::cout << std::endl;
135 __host__ DataSet *PdfBase::getData() { return data_; }
__host__ void PdfBase::setData(DataSet *data) {
    // Binds a dataset to this PDF: frees any previously-loaded device event
    // buffer, then packs the events (unbinned) or bins (binned) into a flat
    // host array and uploads it to dev_event_array. The MPI sections below
    // additionally split the events across ranks.
    // NOTE(review): the MPI and non-MPI code paths appear interleaved here;
    // presumably they are separated by #ifdef GOOFIT_MPI guards that are not
    // visible in this view of the file — confirm against the full source.

    // Release any previous dataset's device buffer before loading a new one.
    if(dev_event_array) {
        gooFree(dev_event_array);
        cudaDeviceSynchronize();
        dev_event_array = nullptr;
        m_iEventsPerTask = 0;

    // Do nothing if passed a nullptr (makes setData(getData()) safe)

    UnbinnedDataSet *unbinned_data;
    BinnedDataSet *binned_data;

    // Dispatch on the dynamic type of the dataset.
    if((unbinned_data = dynamic_cast<UnbinnedDataSet *>(data))) {
        numEntries = data->getNumEvents();
        numEvents = numEntries;

        // One column per observable in the flattened event buffer.
        size_t dimensions = observables.size();

        // This fetches our rank and the total number of processes in the MPI call
        MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myId);

        // Events per rank before remainder handling.
        int perTask = numEvents / numProcs;

        // This will track for a given rank where they will start and how far they will go
        int *counts = new int[numProcs];
        int *displacements = new int[numProcs];

        // NOTE(review): the loop body assigning counts[i] for the first
        // numProcs-1 ranks is not visible here (presumably counts[i] = perTask).
        for(int i = 0; i < numProcs - 1; i++)

        // Last rank absorbs the remainder of the integer division.
        counts[numProcs - 1] = numEntries - perTask * (numProcs - 1);

        displacements[0] = 0;

        // Prefix sum of counts gives each rank's starting event.
        for(int i = 1; i < numProcs; i++)
            displacements[i] = displacements[i - 1] + counts[i - 1];

        // Flat host-side staging buffer: numEntries rows x dimensions columns.
        auto *host_array = new fptype[numEntries * dimensions];

        // This is an array to track if we need to re-index the observable
        // NOTE(review): variable-length array is non-standard C++ — consider
        // std::vector<int> fixme(observables.size(), 0) instead.
        int fixme[observables.size()];
        memset(fixme, 0, sizeof(int) * observables.size());

        for(int i = 0; i < observables.size(); i++) {
            // We are casting the observable to a CountVariable
            CountingVariable *c = dynamic_cast<CountingVariable *>(observables[i]);

            // if it is true re-index

        // Transfer into our whole buffer
        for(int i = 0; i < numEntries; ++i) {
            for(const Observable &v : observables) {
                fptype currVal = unbinned_data->getValue(v, i);
                host_array[i * dimensions + v.getIndex()] = currVal;

        // We will go through all of the events and re-index if appropriate
        // NOTE(review): the guard selecting which columns get re-indexed
        // (presumably `if(fixme[k] > 0)`) is not visible in this view.
        for(int i = 1; i < numProcs; i++) {
            for(int j = 0; j < counts[i]; j++) {
                for(int k = 0; k < dimensions; k++) {
                    host_array[(j + displacements[i]) * dimensions + k] = float(j);

        // This rank's slice of the event buffer.
        int mystart = displacements[myId];
        int myend = mystart + counts[myId];
        int mycount = myend - mystart;

        // Upload only this rank's slice to the device.
        gooMalloc((void **)&dev_event_array, dimensions * mycount * sizeof(fptype));
        MEMCPY(dev_event_array,
               host_array + mystart * dimensions,
               dimensions * mycount * sizeof(fptype),
               cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

        setNumPerTask(this, mycount);

        delete[] displacements;

        // Non-MPI path: upload the whole buffer.
        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * numEntries * sizeof(fptype));
        MEMCPY(dev_event_array, host_array, dimensions * numEntries * sizeof(fptype), cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

    } else if((binned_data = dynamic_cast<BinnedDataSet *>(data))) {
        // Binned path: one "event" per bin.
        numEntries = binned_data->getNumBins();
        int dimensions = 2 + observables.size(); // Bin center (x,y, ...), bin value, and bin volume.

        // Binned data implies a binned fit; switch the fit control if needed.
        if(!fitControl->binnedFit())
            setFitControl(std::make_shared<BinnedNllFit>());

        // This fetches our rank and the total number of processes in the MPI call
        MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myId);

        int perTask = numEvents / numProcs;

        // This will track for a given rank where they will start and how far they will go
        int *counts = new int[numProcs];
        int *displacements = new int[numProcs];

        // NOTE(review): as above, the counts[i] assignment body is elided here.
        for(int i = 0; i < numProcs - 1; i++)

        counts[numProcs - 1] = numEntries - perTask * (numProcs - 1);

        displacements[0] = 0;

        for(int i = 1; i < numProcs; i++)
            displacements[i] = displacements[i - 1] + counts[i - 1];

        auto *host_array = new fptype[numEntries * dimensions];

        // This is an array to track if we need to re-index the observable
        int fixme[observables.size()];
        memset(fixme, 0, sizeof(int) * observables.size());

        for(int i = 0; i < observables.size(); i++) {
            // We are casting the observable to a CountVariable
            CountingVariable *c = dynamic_cast<CountingVariable *>(observables[i]);

            // if it is true re-index

        // Pack each bin: centers per observable, then content, then either
        // bin error or bin volume depending on the fit control.
        for(unsigned int i = 0; i < numEntries; ++i) {
            for(const Observable &v : observables) {
                host_array[i * dimensions + v.getIndex()] = binned_data->getBinCenter(v, i);

            host_array[i * dimensions + observables.size() + 0] = binned_data->getBinContent(i);
            host_array[i * dimensions + observables.size() + 1]
                = fitControl->binErrors() ? binned_data->getBinError(i) : binned_data->getBinVolume(i);
            numEvents += binned_data->getBinContent(i);

        // We will go through all of the events and re-index if appropriate
        // NOTE(review): loop bound `counts[j]` looks like a copy/paste bug —
        // the matching unbinned branch above uses `counts[i]`. Confirm and fix.
        for(int i = 1; i < numProcs; i++) {
            for(int j = 0; j < counts[j]; j++) {
                for(int k = 0; k < dimensions; k++) {
                    host_array[(j + displacements[i]) * dimensions + k] = float(j);

        int mystart = displacements[myId];
        int myend = mystart + counts[myId];
        int mycount = myend - mystart;

        gooMalloc((void **)&dev_event_array, dimensions * mycount * sizeof(fptype));
        MEMCPY(dev_event_array,
               host_array + mystart * dimensions,
               dimensions * mycount * sizeof(fptype),
               cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

        setNumPerTask(this, mycount);

        delete[] displacements;

        // Non-MPI path: upload the whole binned buffer.
        gooMalloc(reinterpret_cast<void **>(&dev_event_array), dimensions * numEntries * sizeof(fptype));
        MEMCPY(dev_event_array, host_array, dimensions * numEntries * sizeof(fptype), cudaMemcpyHostToDevice);
        MEMCPY_TO_SYMBOL(functorConstants, &numEvents, sizeof(fptype), 0, cudaMemcpyHostToDevice);

        // Neither unbinned nor binned: reject.
        throw GooFit::GeneralError("Dataset must be binned or unbinned!");
__host__ void PdfBase::generateNormRange() {
    // Allocate and fill the device-side normRanges buffer: one triplet per
    // observable, (lower limit, upper limit, number of integration bins).
    gooMalloc(reinterpret_cast<void **>(&normRanges), 3 * observables.size() * sizeof(fptype));

    auto *host_norms = new fptype[3 * observables.size()];
    int counter = 0; // Don't use index in this case to allow for, eg,

    // a single observable whose index is 1; or two observables with indices
    // 0 and 2. Make one array per functor, as opposed to variable, to make
    // it easy to pass MetricTaker a range without worrying about which parts

    for(Observable &v : observables) {
        host_norms[3 * counter + 0] = v.getLowerLimit();
        host_norms[3 * counter + 1] = v.getUpperLimit();
        // A positive global integrationBins overrides the per-observable bin count.
        host_norms[3 * counter + 2] = integrationBins > 0 ? integrationBins : v.getNumBins();

    MEMCPY(normRanges, host_norms, 3 * observables.size() * sizeof(fptype), cudaMemcpyHostToDevice);
void PdfBase::clearCurrentFit() {
    // Release the device-side event buffer so a fresh dataset can be loaded.
    gooFree(dev_event_array);
    dev_event_array = nullptr;
__host__ void PdfBase::printProfileInfo(bool topLevel) {
    // Copy the device-side timing histogram back to the host and print this
    // PDF's accumulated time, then recurse into components (with topLevel
    // false, so only the root call performs top-level-only work).

    cudaError_t err = MEMCPY_FROM_SYMBOL(host_timeHist, timeHistogram, 10000 * sizeof(fptype), 0);

    if(cudaSuccess != err) {
        std::cout << "Error on copying timeHistogram: " << cudaGetErrorString(err) << std::endl;

    // Each function index owns a 100-slot stripe of the histogram, indexed
    // further by this PDF's parameter index.
    std::cout << getName() << " : " << getFunctionIndex() << " "
              << host_timeHist[100 * getFunctionIndex() + getParameterIndex()] << std::endl;

    for(unsigned int i = 0; i < components.size(); ++i) {
        components[i]->printProfileInfo(false);
// Backend-agnostic device allocation: plain malloc when Thrust's device
// system is not CUDA (OMP/TBB/CPP backends), cudaMalloc otherwise. Returns
// a cudaError_t in both cases so callers can check uniformly.
// NOTE(review): the #else/#endif and the malloc-failure check are not
// visible in this view of the file.
cudaError_t gooMalloc(void **target, size_t bytes) {
#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA
    target[0] = malloc(bytes);

    // Reported when the host allocation fails.
    return cudaErrorMemoryAllocation;

    return cudaMalloc(target, bytes);
// Counterpart to gooMalloc: frees with the host allocator on non-CUDA
// Thrust backends, cudaFree on the CUDA backend.
cudaError_t gooFree(void *ptr) {
#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA

    return cudaFree(ptr);
419 } // namespace GooFit