1 #include <goofit/GlobalCudaDefines.h>
2 #include <goofit/PDFs/GooPdf.h>
3 #include <goofit/detail/ThrustOverride.h>
5 #include <goofit/BinnedDataSet.h>
6 #include <goofit/Error.h>
7 #include <goofit/FitControl.h>
8 #include <goofit/Log.h>
9 #include <goofit/UnbinnedDataSet.h>
10 #include <goofit/Variable.h>
12 #include <thrust/device_vector.h>
13 #include <thrust/host_vector.h>
14 #include <thrust/iterator/constant_iterator.h>
15 #include <thrust/iterator/zip_iterator.h>
16 #include <thrust/sequence.h>
17 #include <thrust/transform.h>
18 #include <thrust/transform_reduce.h>
30 // These variables are either function-pointer related (thus specific to this implementation)
31 // or constrained to be in the CUDAglob translation unit by nvcc limitations; otherwise they
32 // would be in PdfBase.
34 // Device-side, translation-unit constrained.
36 __constant__ fptype cudaArray[maxParams];
37 __constant__ unsigned int paramIndices[maxParams];
38 __constant__ fptype functorConstants[maxParams];
39 __constant__ fptype normalisationFactors[maxParams];
43 __constant__ int callnumber;
44 __constant__ int gpuDebug;
45 __constant__ unsigned int debugParamIndex;
46 __device__ int internalDebug1 = -1;
47 __device__ int internalDebug2 = -1;
48 __device__ int internalDebug3 = -1;
52 __device__ fptype timeHistogram[10000];
53 fptype host_timeHist[10000];
56 // Function-pointer related.
57 __device__ void *device_function_table[200];
58 // Not clear why this cannot be __constant__, but it causes crashes to declare it so.
60 void *host_function_table[200];
61 unsigned int num_device_functions = 0;
62 std::map<void *, int> functionAddressToDeviceIndexMap;
64 // For use in debugging memory issues
65 void printMemoryStatus(std::string file, int line) {
68 cudaDeviceSynchronize();
70 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
71 cudaMemGetInfo(&memfree, &memtotal);
73 cudaDeviceSynchronize();
74 std::cout << "Memory status " << file << " " << line << " Free " << memfree << " Total " << memtotal << " Used "
75 << (memtotal - memfree) << std::endl;
78 __device__ fptype calculateEval(fptype rawPdf, fptype *evtVal, unsigned int par) {
79 // Just return the raw PDF value, for use in (eg) normalisation.
83 __device__ fptype calculateNLL(fptype rawPdf, fptype *evtVal, unsigned int par) {
84 // if ((10 > callnumber) && (THREADIDX < 10) && (BLOCKIDX == 0)) cuPrintf("calculateNll %i %f %f %f\n", callnumber,
85 // rawPdf, normalisationFactors[par], rawPdf*normalisationFactors[par]);
86 // if (THREADIDX < 50) printf("Thread %i %f %f\n", THREADIDX, rawPdf, normalisationFactors[par]);
87 rawPdf *= normalisationFactors[par];
88 return rawPdf > 0 ? -log(rawPdf) : 0;
91 __device__ fptype calculateProb(fptype rawPdf, fptype *evtVal, unsigned int par) {
92 // Return probability, ie normalized PDF value.
93 return rawPdf * normalisationFactors[par];
96 __device__ fptype calculateBinAvg(fptype rawPdf, fptype *evtVal, unsigned int par) {
97 rawPdf *= normalisationFactors[par];
98 rawPdf *= evtVal[1]; // Bin volume
100 // Log-likelihood of numEvents with expectation of exp is (-exp + numEvents*ln(exp) - ln(numEvents!)).
101 // The last is constant, so we drop it; and then multiply by minus one to get the negative log-likelihood.
103 fptype expEvents = functorConstants[0] * rawPdf;
104 return (expEvents - evtVal[0] * log(expEvents));
110 __device__ fptype calculateBinWithError(fptype rawPdf, fptype *evtVal, unsigned int par) {
111 // In this case interpret the rawPdf as just a number, not a number of events.
112 // Do not divide by integral over phase space, do not multiply by bin volume,
113 // and do not collect 200 dollars. evtVal should have the structure (bin entry, bin error).
114 // printf("[%i, %i] ((%f - %f) / %f)^2 = %f\n", BLOCKIDX, THREADIDX, rawPdf, evtVal[0], evtVal[1], pow((rawPdf -
115 // evtVal[0]) / evtVal[1], 2));
116 rawPdf -= evtVal[0]; // Subtract observed value.
117 rawPdf /= evtVal[1]; // Divide by error.
122 __device__ fptype calculateChisq(fptype rawPdf, fptype *evtVal, unsigned int par) {
123 rawPdf *= normalisationFactors[par];
124 rawPdf *= evtVal[1]; // Bin volume
126 return POW2(rawPdf * functorConstants[0] - evtVal[0]) / (evtVal[0] > 1 ? evtVal[0] : 1);
129 __device__ device_metric_ptr ptr_to_Eval = calculateEval;
130 __device__ device_metric_ptr ptr_to_NLL = calculateNLL;
131 __device__ device_metric_ptr ptr_to_Prob = calculateProb;
132 __device__ device_metric_ptr ptr_to_BinAvg = calculateBinAvg;
133 __device__ device_metric_ptr ptr_to_BinWithError = calculateBinWithError;
134 __device__ device_metric_ptr ptr_to_Chisq = calculateChisq;
136 void *host_fcn_ptr = nullptr;
138 void *getMetricPointer(std::string name) {
139 #define CHOOSE_PTR(ptrname) \
140 if(name == #ptrname) \
141 GET_FUNCTION_ADDR(ptrname);
142 host_fcn_ptr = nullptr;
143 CHOOSE_PTR(ptr_to_Eval);
144 CHOOSE_PTR(ptr_to_NLL);
145 CHOOSE_PTR(ptr_to_Prob);
146 CHOOSE_PTR(ptr_to_BinAvg);
147 CHOOSE_PTR(ptr_to_BinWithError);
148 CHOOSE_PTR(ptr_to_Chisq);
150 if(host_fcn_ptr == nullptr)
151 throw GooFit::GeneralError("host_fcn_ptr is nullptr");
157 void *getMetricPointer(EvalFunc val) { return getMetricPointer(evalfunc_to_string(val)); }
159 __host__ int GooPdf::findFunctionIdx(void *dev_functionPtr) {
160 // Code specific to function-pointer implementation
161 auto localPos = functionAddressToDeviceIndexMap.find(dev_functionPtr);
163 if(localPos != functionAddressToDeviceIndexMap.end()) {
164 return (*localPos).second;
167 int fIdx = num_device_functions;
168 host_function_table[num_device_functions] = dev_functionPtr;
169 functionAddressToDeviceIndexMap[dev_functionPtr] = num_device_functions;
170 num_device_functions++;
172 device_function_table, host_function_table, num_device_functions * sizeof(void *), 0, cudaMemcpyHostToDevice);
175 host_timeHist[fIdx] = 0;
176 MEMCPY_TO_SYMBOL(timeHistogram, host_timeHist, 10000 * sizeof(fptype), 0);
182 __host__ void GooPdf::initialize(std::vector<unsigned int> pindices, void *dev_functionPtr) {
184 setFitControl(std::make_shared<UnbinnedNllFit>());
186 // MetricTaker must be created after PdfBase initialisation is done.
187 PdfBase::initializeIndices(pindices);
189 functionIdx = findFunctionIdx(dev_functionPtr);
193 __host__ void GooPdf::setDebugMask(int mask, bool setSpecific) const {
195 #if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA
199 debugParamIndex = parameters;
202 MEMCPY_TO_SYMBOL(gpuDebug, &cpuDebug, sizeof(int), 0, cudaMemcpyHostToDevice);
205 MEMCPY_TO_SYMBOL(debugParamIndex, ¶meters, sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
210 __host__ void GooPdf::setMetrics() {
211 logger = std::make_shared<MetricTaker>(this, getMetricPointer(fitControl->getMetric()));
214 __host__ double GooPdf::sumOfNll(int numVars) const {
215 static thrust::plus<double> cudaPlus;
216 thrust::constant_iterator<int> eventSize(numVars);
217 thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
220 // if (host_callnumber >= 2) GooFit::abort(__FILE__, __LINE__, getName() + " debug abort", this);
221 thrust::counting_iterator<int> eventIndex(0);
225 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
226 goofit_policy my_policy;
227 double r = thrust::transform_reduce(
229 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
230 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
235 double r = thrust::transform_reduce(
236 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
237 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
243 MPI_Allreduce(&r, &ret, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
245 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
246 goofit_policy my_policy;
247 ret = thrust::transform_reduce(
249 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
250 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
255 ret = thrust::transform_reduce(
256 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
257 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
266 __host__ double GooPdf::calculateNLL() const {
267 // if (cpuDebug & 1) std::cout << getName() << " entering calculateNLL (" << host_callnumber << ")" << std::endl;
269 // MEMCPY_TO_SYMBOL(callnumber, &host_callnumber, sizeof(int));
270 // int oldMask = cpuDebug;
271 // if (0 == host_callnumber) setDebugMask(0, false);
272 // std::cout << "Start norm " << getName() << std::endl;
274 // std::cout << "Norm done\n";
275 // if ((0 == host_callnumber) && (1 == oldMask)) setDebugMask(1, false);
277 // if (cpuDebug & 1) {
278 // std::cout << "Norm factors: ";
279 // for (int i = 0; i < totalParams; ++i) std::cout << host_normalisation[i] << " ";
280 // std::cout << std::endl;
283 if(host_normalisation[parameters] <= 0)
284 GooFit::abort(__FILE__, __LINE__, getName() + " non-positive normalisation", this);
286 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
287 cudaDeviceSynchronize(); // Ensure normalisation integrals are finished
289 int numVars = observables.size();
291 if(fitControl->binnedFit()) {
296 fptype ret = sumOfNll(numVars);
299 GooFit::abort(__FILE__, __LINE__, getName() + " zero NLL", this);
301 // if (cpuDebug & 1) std::cout << "Full NLL " << host_callnumber << " : " << 2*ret << std::endl;
304 // if ((cpuDebug & 1) && (host_callnumber >= 1)) GooFit::abort(__FILE__, __LINE__, getName() + " debug abort",
309 __host__ std::vector<fptype> GooPdf::evaluateAtPoints(Observable var) {
312 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
313 UnbinnedDataSet tempdata(observables);
315 double step = var.getBinSize();
317 for(int i = 0; i < var.getNumBins(); ++i) {
318 var.setValue(var.getLowerLimit() + (i + 0.5) * step);
322 auto old = getData();
325 thrust::counting_iterator<int> eventIndex(0);
326 thrust::constant_iterator<int> eventSize(observables.size());
327 thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
328 thrust::device_vector<fptype> results(var.getNumBins());
330 MetricTaker evalor(this, getMetricPointer("ptr_to_Eval"));
333 thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
334 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + m_iEventsPerTask, arrayAddress, eventSize)),
338 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
339 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
344 // Note, This is not fully realized with MPI. We need to copy each 'results' buffer to each other 'MPI_Scatterv',
345 // then we can do the rest.
346 thrust::host_vector<fptype> h_results = results;
347 std::vector<fptype> res;
348 res.resize(var.getNumBins());
350 for(int i = 0; i < var.getNumBins(); ++i) {
351 res[i] = h_results[i] * host_normalisation[parameters];
358 __host__ void GooPdf::scan(Observable var, std::vector<fptype> &values) {
359 fptype step = var.getUpperLimit();
360 step -= var.getLowerLimit();
361 step /= var.getNumBins();
364 for(fptype v = var.getLowerLimit() + 0.5 * step; v < var.getUpperLimit(); v += step) {
367 fptype curr = calculateNLL();
368 values.push_back(curr);
372 // TODO: is this needed?
373 __host__ void GooPdf::setParameterConstantness(bool constant) {
374 std::vector<Variable> pars = getParameters();
376 for(Variable &p : pars) {
377 p.setFixed(constant);
381 __host__ fptype GooPdf::getValue(EvalFunc evalfunc) {
382 // Returns the value of the PDF at a single point.
383 // Execute redundantly in all threads for OpenMP multiGPU case
386 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
388 UnbinnedDataSet point(observables);
390 auto old = getData();
393 thrust::counting_iterator<int> eventIndex(0);
394 thrust::constant_iterator<int> eventSize(observables.size());
395 thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
396 thrust::device_vector<fptype> results(1);
398 MetricTaker evalor(this, getMetricPointer(evalfunc));
399 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
400 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + 1, arrayAddress, eventSize)),
408 __host__ fptype GooPdf::normalize() const {
409 if(!fitControl->metricIsPdf()) {
410 GOOFIT_TRACE("{}: metricIsPdf, returning 1", getName());
411 host_normalisation[parameters] = 1.0;
417 if(hasAnalyticIntegral()) {
418 // Loop goes only over observables of this PDF.
419 for(const Observable &v : observables) {
420 GOOFIT_TRACE("{}: Analytically integrating over {}", getName(), v.getName());
421 ret *= integrate(v.getLowerLimit(), v.getUpperLimit());
424 host_normalisation[parameters] = 1.0 / ret;
425 GOOFIT_TRACE("{}: Param {} integral is = {}", getName(), parameters, ret);
430 GOOFIT_TRACE("{}, Computing integral without analytic help", getName());
434 for(const Observable &v : observables) {
435 ret *= v.getUpperLimit() - v.getLowerLimit();
436 totalBins *= integrationBins > 0 ? integrationBins : v.getNumBins();
438 GOOFIT_TRACE("Total bins {} due to {} {} {}", totalBins, v.getName(), integrationBins, v.getNumBins());
444 static thrust::plus<fptype> cudaPlus;
445 thrust::constant_iterator<fptype *> arrayAddress(normRanges);
446 thrust::constant_iterator<int> eventSize(observables.size());
447 thrust::counting_iterator<int> binIndex(0);
451 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
452 goofit_policy my_policy;
453 fptype s = thrust::transform_reduce(
455 thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
456 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, eventSize, arrayAddress)),
461 fptype s = thrust::transform_reduce(
462 thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
463 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, eventSize, arrayAddress)),
469 MPI_Allreduce(&s, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
471 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
472 goofit_policy my_policy;
473 sum = thrust::transform_reduce(
475 thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
476 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, eventSize, arrayAddress)),
481 sum = thrust::transform_reduce(
482 thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
483 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, eventSize, arrayAddress)),
490 if(std::isnan(sum)) {
491 GooFit::abort(__FILE__, __LINE__, getName() + " NaN in normalisation", this);
492 } else if(0 >= sum) {
493 GooFit::abort(__FILE__, __LINE__, "Non-positive normalisation", this);
499 GooFit::abort(__FILE__, __LINE__, "Zero integral");
501 GOOFIT_TRACE("{}: Param {} integral is ~= {}", getName(), parameters, ret);
502 host_normalisation[parameters] = 1.0 / ret;
507 __constant__ fptype conversion = (1.0 / CLOCKS_PER_SEC);
508 __device__ fptype callFunction(fptype *eventAddress, unsigned int functionIdx, unsigned int paramIdx) {
509 clock_t start = clock();
510 fptype ret = (*(reinterpret_cast<device_function_ptr>(device_function_table[functionIdx])))(
511 eventAddress, cudaArray, paramIndices + paramIdx);
512 clock_t stop = clock();
514 if((0 == THREADIDX + BLOCKIDX) && (stop > start)) {
515 // Avoid issue when stop overflows and start doesn't.
516 timeHistogram[functionIdx * 100 + paramIdx] += ((stop - start) * conversion);
517 // printf("Clock: %li %li %li | %u %f\n", (long) start, (long) stop, (long) (stop - start), functionIdx,
518 // timeHistogram[functionIdx]);
524 __device__ fptype callFunction(fptype *eventAddress, unsigned int functionIdx, unsigned int paramIdx) {
525 return (*(reinterpret_cast<device_function_ptr>(device_function_table[functionIdx])))(
526 eventAddress, cudaArray, paramIndices + paramIdx);
530 __host__ std::vector<std::vector<fptype>> GooPdf::getCompProbsAtDataPoints() {
534 MEMCPY_TO_SYMBOL(normalisationFactors, host_normalisation, totalParams * sizeof(fptype), 0, cudaMemcpyHostToDevice);
536 int numVars = observables.size();
538 if(fitControl->binnedFit()) {
543 thrust::device_vector<fptype> results(numEntries);
544 thrust::constant_iterator<int> eventSize(numVars);
545 thrust::constant_iterator<fptype *> arrayAddress(dev_event_array);
546 thrust::counting_iterator<int> eventIndex(0);
547 MetricTaker evalor(this, getMetricPointer("ptr_to_Prob"));
548 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, arrayAddress, eventSize)),
549 thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
552 std::vector<std::vector<fptype>> values;
553 values.resize(components.size() + 1);
554 thrust::host_vector<fptype> host_results = results;
556 for(unsigned int i = 0; i < host_results.size(); ++i) {
557 values[0].push_back(host_results[i]);
560 for(unsigned int i = 0; i < components.size(); ++i) {
561 MetricTaker compevalor(components[i], getMetricPointer("ptr_to_Prob"));
562 thrust::counting_iterator<int> ceventIndex(0);
564 thrust::make_zip_iterator(thrust::make_tuple(ceventIndex, arrayAddress, eventSize)),
565 thrust::make_zip_iterator(thrust::make_tuple(ceventIndex + numEntries, arrayAddress, eventSize)),
568 host_results = results;
570 for(unsigned int j = 0; j < host_results.size(); ++j) {
571 values[1 + i].push_back(host_results[j]);
577 __host__ UnbinnedDataSet GooPdf::makeGrid() {
578 std::vector<Observable> ret = getObservables();
580 UnbinnedDataSet grid{ret};
586 // still need to add OpenMP/multi-GPU code here
587 __host__ void GooPdf::transformGrid(fptype *host_output) {
592 for(const Observable &v : observables) {
593 totalBins *= v.getNumBins();
596 thrust::constant_iterator<fptype *> arrayAddress(normRanges);
597 thrust::constant_iterator<int> eventSize(observables.size());
598 thrust::counting_iterator<int> binIndex(0);
599 thrust::device_vector<fptype> d_vec;
600 d_vec.resize(totalBins);
602 thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(binIndex, eventSize, arrayAddress)),
603 thrust::make_zip_iterator(thrust::make_tuple(binIndex + totalBins, eventSize, arrayAddress)),
607 thrust::host_vector<fptype> h_vec = d_vec;
609 for(unsigned int i = 0; i < totalBins; ++i)
610 host_output[i] = h_vec[i];
613 __host__ void GooPdf::setFitControl(std::shared_ptr<FitControl> fc) {
614 for(auto &component : components) {
615 component->setFitControl(fc);
624 __host__ TH1D *GooPdf::plotToROOT(Observable var, double normFactor, std::string name) {
626 name = getName() + "_hist";
628 auto ret = new TH1D(name.c_str(), "", var.getNumBins(), var.getLowerLimit(), var.getUpperLimit());
629 std::vector<fptype> binValues = evaluateAtPoints(var);
633 for(int i = 0; i < var.getNumBins(); ++i) {
634 pdf_int += binValues[i];
637 for(int i = 0; i < var.getNumBins(); ++i)
638 ret->SetBinContent(i + 1, binValues[i] * normFactor / pdf_int / var.getBinSize());
642 } // namespace GooFit