Chemical Data Processing Library C++ API - Version 1.2.1
MLRModel.hpp
Go to the documentation of this file.
1 /*
2  * MLRModel.hpp
3  *
4  * This file is part of the Chemical Data Processing Toolkit
5  *
6  * Copyright (C) 2003 Thomas Seidel <thomas.seidel@univie.ac.at>
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this library; see the file COPYING. If not, write to
20  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21  * Boston, MA 02111-1307, USA.
22  */
23 
29 #ifndef CDPL_MATH_MLRMODEL_HPP
30 #define CDPL_MATH_MLRMODEL_HPP
31 
32 #include <limits>
33 
34 #include "CDPL/Math/Vector.hpp"
35 #include "CDPL/Math/Matrix.hpp"
36 #include "CDPL/Math/CommonType.hpp"
37 #include "CDPL/Math/TypeTraits.hpp"
40 #include "CDPL/Base/Exceptions.hpp"
41 
42 
43 namespace CDPL
44 {
45 
46  namespace Math
47  {
48 
78  template <typename T = double>
79  class MLRModel
80  {
81 
82  public:
84  typedef T ValueType;
87 
92  chiSquare(0), Q(0), r(0), stdDeviation(0) {}
93 
99  void resizeDataSet(SizeType num_points, SizeType num_vars);
100 
106  void clearDataSet();
107 
121  template <typename V>
122  void setXYData(SizeType i, const VectorExpression<V>& x_vars, ValueType y);
123 
138  template <typename V>
139  void addXYData(const VectorExpression<V>& x_vars, ValueType y);
140 
147 
153  const MatrixType& getXMatrix() const;
154 
161 
167  const VectorType& getYValues() const;
168 
175  void buildModel();
176 
185  template <typename V>
187 
199  template <typename V>
201 
207  const VectorType& getCoefficients() const;
208 
221  ValueType getChiSquare() const;
222 
238  ValueType getGoodnessOfFit() const;
239 
262 
276 
285  void calcStatistics();
286 
287  private:
288  MatrixType xMatrix;
289  VectorType yValues;
290  VectorType calcYValues;
291  VectorType coefficients;
292  MatrixType svdU;
293  MatrixType svdV;
294  VectorType svdW;
295  ValueType chiSquare;
296  ValueType Q;
297  ValueType r;
298  ValueType stdDeviation;
299  };
300  } // namespace Math
301 } // namespace CDPL
302 
303 
304 // Implementation
305 
306 template <typename T>
308 {
309  if (num_points == xMatrix.getSize1() && num_vars == xMatrix.getSize2())
310  return;
311 
312  xMatrix.resize(num_points, num_vars, true, ValueType());
313  yValues.resize(num_points, ValueType());
314 }
315 
316 template <typename T>
318 {
319  yValues.resize(0);
320  xMatrix.resize(0, 0, false);
321 }
322 
323 template <typename T>
324 template <typename V>
326 {
327  SizeType x_mtx_size1 = xMatrix.getSize1();
328  SizeType x_mtx_size2 = xMatrix.getSize2();
329  SizeType x_vars_size = x_vars().getSize();
330  SizeType y_vals_size = yValues.getSize();
331 
332  resizeDataSet(std::max(i + 1, std::max(x_mtx_size1, y_vals_size)), std::max(x_vars_size, x_mtx_size2));
333 
334  for (SizeType j = 0; j < x_vars_size; j++)
335  xMatrix(i, j) = x_vars()(j);
336 
337  if (x_vars_size < x_mtx_size2)
338  for (SizeType j = x_vars_size; j < x_mtx_size2; j++)
339  xMatrix(i, j) = ValueType();
340 
341  yValues(i) = y;
342 }
343 
344 template <typename T>
345 template <typename V>
347 {
348  SizeType i = xMatrix.getSize1();
349  SizeType x_mtx_size2 = xMatrix.getSize2();
350  SizeType x_vars_size = x_vars().getSize();
351 
352  resizeDataSet(i + 1, std::max(x_mtx_size2, x_vars_size));
353 
354  for (SizeType j = 0; j < x_vars_size; j++)
355  xMatrix(i, j) = x_vars()(j);
356 
357  if (x_vars_size < x_mtx_size2)
358  for (SizeType j = x_vars_size; j < x_mtx_size2; j++)
359  xMatrix(i, j) = ValueType();
360 
361  yValues(i) = y;
362 }
363 
364 template <typename T>
367 {
368  return xMatrix;
369 }
370 
371 template <typename T>
374 {
375  return xMatrix;
376 }
377 
378 template <typename T>
381 {
382  return yValues;
383 }
384 
385 template <typename T>
388 {
389  return yValues;
390 }
391 
392 template <typename T>
394 {
395  static const ValueType TOLERANCE(1.0e-6);
396 
397  SizeType n = xMatrix.getSize1();
398  SizeType m = xMatrix.getSize2();
399 
400  if (m == 0 || n == 0)
401  throw Base::CalculationFailed("MLRModel: empty data set");
402 
403  if (n != SizeType(yValues.getSize()))
404  resizeDataSet(std::max(SizeType(yValues.getSize()), n), m);
405 
406  svdU.resize(n, m, false);
407  svdV.resize(m, m, false);
408  svdW.resize(m, false);
409 
410  svdU = xMatrix;
411 
412  if (!svDecompose(svdU, svdW, svdV))
413  throw Base::CalculationFailed("MLRModel: singular value decomposition failed");
414 
415  ValueType w_max = svdW(0);
416 
417  for (SizeType i = 1; i < m; i++)
418  if (svdW(i) > w_max)
419  w_max = svdW(i);
420 
421  ValueType tresh = TOLERANCE * w_max;
422 
423  for (SizeType i = 0; i < m; i++)
424  if (svdW(i) < tresh)
425  svdW(i) = 0;
426 
427  coefficients.resize(m, false);
428 
429  svSubstitute(svdU, svdW, svdV, yValues, coefficients);
430 }
431 
432 template <typename T>
433 template <typename V>
436 {
437  if (SizeType(x().getSize()) != SizeType(coefficients.getSize()))
438  throw Base::CalculationFailed("MLRModel: number of regression coefficients does not match number of independent variables");
439 
440  return innerProd(coefficients, x);
441 }
442 
443 template <typename T>
444 template <typename V>
447 {
448  return calcYValue(x);
449 }
450 
451 template <typename T>
454 {
455  return coefficients;
456 }
457 
458 template <typename T>
461 {
462  return chiSquare;
463 }
464 
465 template <typename T>
468 {
469  return Q;
470 }
471 
472 template <typename T>
475 {
476  return r;
477 }
478 
479 template <typename T>
482 {
483  return stdDeviation;
484 }
485 
486 template <typename T>
488 {
489  ValueType mean_data_y = 0;
490  ValueType mean_calc_y = 0;
491 
492  chiSquare = ValueType();
493 
494  SizeType m = xMatrix.getSize2();
495  SizeType n = xMatrix.getSize1();
496 
497  if (m != SizeType(coefficients.getSize()))
498  throw Base::CalculationFailed("MLRModel: number of independent variables does not match number of regression coefficients");
499 
500  if (n != SizeType(yValues.getSize()))
501  throw Base::CalculationFailed("MLRModel: number of dependent variables does not match number of vectors with independent variables");
502 
503  calcYValues.resize(n);
504 
505  for (SizeType i = 0; i < n; i++) {
506  ValueType data_y = yValues(i);
507  ValueType calc_y = innerProd(row(xMatrix, i), coefficients);
508  ValueType y_diff = data_y - calc_y;
509 
510  mean_data_y += data_y;
511  mean_calc_y += calc_y;
512  chiSquare += y_diff * y_diff;
513 
514  calcYValues(i) = calc_y;
515  }
516 
517  mean_data_y /= n;
518  mean_calc_y /= n;
519 
520  ValueType sxx = ValueType();
521  ValueType syy = ValueType();
522  ValueType sxy = ValueType();
523 
524  for (SizeType i = 0; i < n; i++) {
525  ValueType xt = yValues(i) - mean_data_y;
526  ValueType yt = calcYValues(i) - mean_calc_y;
527 
528  sxx += xt * xt;
529  syy += yt * yt;
530  sxy += xt * yt;
531  }
532 
533  r = sxy / (TypeTraits<ValueType>::sqrt(sxx * syy) + std::numeric_limits<ValueType>::epsilon());
534 
535  stdDeviation = TypeTraits<ValueType>::sqrt(chiSquare / (n - m));
536 
537  Q = gammaQ(ValueType(n - 2) / 2, chiSquare / 2);
538 }
539 
540 #endif // CDPL_MATH_MLRMODEL_HPP
Definition of exception classes.
Common type deduction.
Definition of matrix data types.
Implementation of matrix singular value decomposition and associated operations.
Provides miscellaneous special mathematical functions.
Definition of type traits.
Definition of vector data types.
Thrown to indicate that some requested calculation has failed.
Definition: Base/Exceptions.hpp:230
Performs Multiple Linear Regression [WLIREG] on a set of data points .
Definition: MLRModel.hpp:80
ValueType operator()(const VectorExpression< V > &x_vars) const
Predicts the value of the dependent variable for a vector of independent variables given by x_vars.
ValueType getCorrelationCoefficient() const
Returns the correlation coefficient .
Definition: MLRModel.hpp:474
MLRModel()
Constructs and initializes a regression model with an empty data set.
Definition: MLRModel.hpp:91
const VectorType & getCoefficients() const
Returns a read-only vector containing the estimated regression coefficients which were calculated by...
Definition: MLRModel.hpp:453
void resizeDataSet(SizeType num_points, SizeType num_vars)
Resizes the data set to hold num_points data points with num_vars independent variables.
Definition: MLRModel.hpp:307
ValueType getGoodnessOfFit() const
Returns the goodness of fit .
Definition: MLRModel.hpp:467
CommonType< typename Vector< T >::SizeType, typename Matrix< T >::SizeType >::Type SizeType
Definition: MLRModel.hpp:83
void buildModel()
Performs linear least squares regression modeling of the set of currently stored data points .
Definition: MLRModel.hpp:393
ValueType getStandardDeviation() const
Returns the standard deviation of the residuals .
Definition: MLRModel.hpp:481
void setXYData(SizeType i, const VectorExpression< V > &x_vars, ValueType y)
Sets the i-th data point of the data set.
Definition: MLRModel.hpp:325
Vector< T > VectorType
Definition: MLRModel.hpp:86
void calcStatistics()
Calculates various statistical parameters describing the built regression model.
Definition: MLRModel.hpp:487
void clearDataSet()
Clears the data set.
Definition: MLRModel.hpp:317
VectorType & getYValues()
Returns a vector containing the dependent variables of the currently stored data points .
Definition: MLRModel.hpp:380
T ValueType
Definition: MLRModel.hpp:84
ValueType calcYValue(const VectorExpression< V > &x_vars) const
Predicts the value of the dependent variable for a vector of independent variables given by x_vars.
ValueType getChiSquare() const
Returns the sum of squared residuals .
Definition: MLRModel.hpp:460
MatrixType & getXMatrix()
Returns a matrix where each row represents the vector with independent variables of the currently st...
Definition: MLRModel.hpp:366
Matrix< T > MatrixType
Definition: MLRModel.hpp:85
void addXYData(const VectorExpression< V > &x_vars, ValueType y)
Adds a new data point to the current data set.
Definition: MLRModel.hpp:346
Definition: Matrix.hpp:280
A::size_type SizeType
Definition: Matrix.hpp:288
Definition: Expression.hpp:54
Definition: Vector.hpp:258
constexpr unsigned int T
Specifies Hydrogen (Tritium).
Definition: AtomType.hpp:67
constexpr unsigned int Q
A generic type that covers any element except hydrogen and carbon.
Definition: AtomType.hpp:647
constexpr unsigned int r
Specifies that the stereocenter has r configuration.
Definition: CIPDescriptor.hpp:76
constexpr unsigned int m
Specifies that the stereocenter has m configuration.
Definition: CIPDescriptor.hpp:116
VectorInnerProduct< E1, E2 >::ResultType innerProd(const VectorExpression< E1 > &e1, const VectorExpression< E2 > &e2)
Definition: VectorExpression.hpp:504
MatrixRow< M > row(MatrixExpression< M > &e, typename MatrixRow< M >::SizeType i)
Definition: MatrixProxy.hpp:716
void svSubstitute(const MatrixExpression< U > &u, const VectorExpression< W > &w, const MatrixExpression< V > &v, const VectorExpression< B > &b, VectorExpression< X > &x)
Solves for a vector where is given by its Singular Value Decomposition [WSVD].
Definition: SVDecomposition.hpp:466
bool svDecompose(MatrixExpression< A > &a, VectorExpression< W > &w, MatrixExpression< V > &v, std::size_t max_iter=0)
Computes the Singular Value Decomposition [WSVD] of a -dimensional matrix a.
Definition: SVDecomposition.hpp:70
T gammaQ(const T &a, const T &x)
Computes the incomplete gamma function (see [NRIC] for details).
CDPL_PHARM_API const Base::LookupKey TOLERANCE
The namespace of the Chemical Data Processing Library.
Definition: CommonType.hpp:41
Definition: TypeTraits.hpp:171