Chemical Data Processing Library C++ API - Version 1.1.1
MLRModel.hpp
Go to the documentation of this file.
1 /*
2  * MLRModel.hpp
3  *
4  * This file is part of the Chemical Data Processing Toolkit
5  *
6  * Copyright (C) 2003 Thomas Seidel <thomas.seidel@univie.ac.at>
7  *
8  * This library is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2 of the License, or (at your option) any later version.
12  *
13  * This library is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this library; see the file COPYING. If not, write to
20  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21  * Boston, MA 02111-1307, USA.
22  */
23 
29 #ifndef CDPL_MATH_MLRMODEL_HPP
30 #define CDPL_MATH_MLRMODEL_HPP
31 
32 #include <limits>
33 
34 #include "CDPL/Math/Vector.hpp"
35 #include "CDPL/Math/Matrix.hpp"
36 #include "CDPL/Math/CommonType.hpp"
37 #include "CDPL/Math/TypeTraits.hpp"
40 #include "CDPL/Base/Exceptions.hpp"
41 
42 
43 namespace CDPL
44 {
45 
46  namespace Math
47  {
48 
78  template <typename T = double>
79  class MLRModel
80  {
81 
82  public:
84  typedef T ValueType;
87 
92  chiSquare(0), Q(0), r(0), stdDeviation(0) {}
93 
99  void resizeDataSet(SizeType num_points, SizeType num_vars);
100 
106  void clearDataSet();
107 
121  template <typename V>
122  void setXYData(SizeType i, const VectorExpression<V>& x_vars, ValueType y);
123 
138  template <typename V>
139  void addXYData(const VectorExpression<V>& x_vars, ValueType y);
140 
147 
153  const MatrixType& getXMatrix() const;
154 
161 
167  const VectorType& getYValues() const;
168 
175  void buildModel();
176 
185  template <typename V>
187 
199  template <typename V>
201 
207  const VectorType& getCoefficients() const;
208 
221  ValueType getChiSquare() const;
222 
238  ValueType getGoodnessOfFit() const;
239 
262 
276 
285  void calcStatistics();
286 
287  private:
288  MatrixType xMatrix;
289  VectorType yValues;
290  VectorType calcYValues;
291  VectorType coefficients;
292  MatrixType svdU;
293  MatrixType svdV;
294  VectorType svdW;
295  ValueType chiSquare;
296  ValueType Q;
297  ValueType r;
298  ValueType stdDeviation;
299  };
300  } // namespace Math
301 } // namespace CDPL
302 
303 
304 // Implementation
305 
306 template <typename T>
308 {
309  if (num_points == xMatrix.getSize1() && num_vars == xMatrix.getSize2())
310  return;
311 
312  xMatrix.resize(num_points, num_vars, true, ValueType());
313  yValues.resize(num_points, ValueType());
314 }
315 
316 template <typename T>
318 {
319  yValues.resize(0);
320  xMatrix.resize(0, 0, false);
321 }
322 
323 template <typename T>
324 template <typename V>
326 {
327  SizeType x_mtx_size1 = xMatrix.getSize1();
328  SizeType x_mtx_size2 = xMatrix.getSize2();
329  SizeType x_vars_size = x_vars().getSize();
330  SizeType y_vals_size = yValues.getSize();
331 
332  resizeDataSet(std::max(i + 1, std::max(x_mtx_size1, y_vals_size)), std::max(x_vars_size, x_mtx_size2));
333 
334  for (SizeType j = 0; j < x_vars_size; j++)
335  xMatrix(i, j) = x_vars()(j);
336 
337  if (x_vars_size < x_mtx_size2)
338  for (SizeType j = x_vars_size; j < x_mtx_size2; j++)
339  xMatrix(i, j) = ValueType();
340 
341  yValues(i) = y;
342 }
343 
344 template <typename T>
345 template <typename V>
347 {
348  SizeType i = xMatrix.getSize1();
349  SizeType x_mtx_size2 = xMatrix.getSize2();
350  SizeType x_vars_size = x_vars().getSize();
351 
352  resizeDataSet(i + 1, std::max(x_mtx_size2, x_vars_size));
353 
354  for (SizeType j = 0; j < x_vars_size; j++)
355  xMatrix(i, j) = x_vars()(j);
356 
357  if (x_vars_size < x_mtx_size2)
358  for (SizeType j = x_vars_size; j < x_mtx_size2; j++)
359  xMatrix(i, j) = ValueType();
360 
361  yValues(i) = y;
362 }
363 
364 template <typename T>
367 {
368  return xMatrix;
369 }
370 
371 template <typename T>
374 {
375  return xMatrix;
376 }
377 
378 template <typename T>
381 {
382  return yValues;
383 }
384 
385 template <typename T>
388 {
389  return yValues;
390 }
391 
392 template <typename T>
394 {
395  static const ValueType TOLERANCE(1.0e-6);
396 
397  SizeType n = xMatrix.getSize1();
398  SizeType m = xMatrix.getSize2();
399 
400  if (m == 0 || n == 0)
401  throw Base::CalculationFailed("MLRModel: empty data set");
402 
403  if (n != SizeType(yValues.getSize()))
404  resizeDataSet(std::max(SizeType(yValues.getSize()), n), m);
405 
406  svdU.resize(n, m, false);
407  svdV.resize(m, m, false);
408  svdW.resize(m, false);
409 
410  svdU = xMatrix;
411 
412  if (!svDecompose(svdU, svdW, svdV))
413  throw Base::CalculationFailed("MLRModel: singular value decomposition failed");
414 
415  ValueType w_max = svdW(0);
416 
417  for (SizeType i = 1; i < m; i++)
418  if (svdW(i) > w_max)
419  w_max = svdW(i);
420 
421  ValueType tresh = TOLERANCE * w_max;
422 
423  for (SizeType i = 0; i < m; i++)
424  if (svdW(i) < tresh)
425  svdW(i) = 0;
426 
427  coefficients.resize(m, false);
428 
429  svSubstitute(svdU, svdW, svdV, yValues, coefficients);
430 }
431 
432 template <typename T>
433 template <typename V>
436 {
437  if (SizeType(x().getSize()) != SizeType(coefficients.getSize()))
438  throw Base::CalculationFailed("MLRModel: number of regression coefficients does not match number of independent variables");
439 
440  return innerProd(coefficients, x);
441 }
442 
443 template <typename T>
444 template <typename V>
447 {
448  return calcYValue(x);
449 }
450 
451 template <typename T>
454 {
455  return coefficients;
456 }
457 
458 template <typename T>
461 {
462  return chiSquare;
463 }
464 
465 template <typename T>
468 {
469  return Q;
470 }
471 
472 template <typename T>
475 {
476  return r;
477 }
478 
479 template <typename T>
482 {
483  return stdDeviation;
484 }
485 
486 template <typename T>
488 {
489  ValueType mean_data_y = 0;
490  ValueType mean_calc_y = 0;
491 
492  chiSquare = ValueType();
493 
494  SizeType m = xMatrix.getSize2();
495  SizeType n = xMatrix.getSize1();
496 
497  if (m != SizeType(coefficients.getSize()))
498  throw Base::CalculationFailed("MLRModel: number of independent variables does not match number of regression coefficients");
499 
500  if (n != SizeType(yValues.getSize()))
501  throw Base::CalculationFailed("MLRModel: number of dependent variables does not match number of vectors with independent variables");
502 
503  calcYValues.resize(n);
504 
505  for (SizeType i = 0; i < n; i++) {
506  ValueType data_y = yValues(i);
507  ValueType calc_y = innerProd(row(xMatrix, i), coefficients);
508  ValueType y_diff = data_y - calc_y;
509 
510  mean_data_y += data_y;
511  mean_calc_y += calc_y;
512  chiSquare += y_diff * y_diff;
513 
514  calcYValues(i) = calc_y;
515  }
516 
517  mean_data_y /= n;
518  mean_calc_y /= n;
519 
520  ValueType sxx = ValueType();
521  ValueType syy = ValueType();
522  ValueType sxy = ValueType();
523 
524  for (SizeType i = 0; i < n; i++) {
525  ValueType xt = yValues(i) - mean_data_y;
526  ValueType yt = calcYValues(i) - mean_calc_y;
527 
528  sxx += xt * xt;
529  syy += yt * yt;
530  sxy += xt * yt;
531  }
532 
533  r = sxy / (TypeTraits<ValueType>::sqrt(sxx * syy) + std::numeric_limits<ValueType>::epsilon());
534 
535  stdDeviation = TypeTraits<ValueType>::sqrt(chiSquare / (n - m));
536 
537  Q = gammaQ(ValueType(n - 2) / 2, chiSquare / 2);
538 }
539 
540 #endif // CDPL_MATH_MLRMODEL_HPP
CDPL::Chem::AtomType::Q
const unsigned int Q
A generic type that covers any element except hydrogen and carbon.
Definition: AtomType.hpp:627
CDPL::Math::MLRModel::resizeDataSet
void resizeDataSet(SizeType num_points, SizeType num_vars)
Resizes the data set to hold num_points data points with num_vars independent variables.
Definition: MLRModel.hpp:307
CDPL::Math::TypeTraits
Definition: TypeTraits.hpp:171
CDPL::Math::Vector< T >
CDPL::Math::MLRModel::MatrixType
Matrix< T > MatrixType
Definition: MLRModel.hpp:85
CDPL::Math::MLRModel::SizeType
CommonType< typename Vector< T >::SizeType, typename Matrix< T >::SizeType >::Type SizeType
Definition: MLRModel.hpp:83
CommonType.hpp
Common type deduction.
CDPL::Math::MLRModel::setXYData
void setXYData(SizeType i, const VectorExpression< V > &x_vars, ValueType y)
Sets the i-th data point of the data set.
Definition: MLRModel.hpp:325
CDPL::Math::VectorExpression
Definition: Expression.hpp:54
CDPL::Math::MLRModel
Performs Multiple Linear Regression [WLIREG] on a set of data points .
Definition: MLRModel.hpp:80
CDPL::Math::innerProd
VectorInnerProduct< E1, E2 >::ResultType innerProd(const VectorExpression< E1 > &e1, const VectorExpression< E2 > &e2)
Definition: VectorExpression.hpp:504
CDPL::Math::MLRModel::getCoefficients
const VectorType & getCoefficients() const
Returns a read-only vector containing the estimated regression coefficients which were calculated by...
Definition: MLRModel.hpp:453
CDPL::Math::MLRModel::clearDataSet
void clearDataSet()
Clears the data set.
Definition: MLRModel.hpp:317
CDPL::Math::MLRModel::calcYValue
ValueType calcYValue(const VectorExpression< V > &x_vars) const
Predicts the value of the dependent variable for a vector of independent variables given by x_vars.
CDPL::Math::MLRModel::getCorrelationCoefficient
ValueType getCorrelationCoefficient() const
Returns the correlation coefficient .
Definition: MLRModel.hpp:474
CDPL::Math::MLRModel::VectorType
Vector< T > VectorType
Definition: MLRModel.hpp:86
CDPL::Math::MLRModel::MLRModel
MLRModel()
Constructs and initializes a regression model with an empty data set.
Definition: MLRModel.hpp:91
CDPL::Math::MLRModel::getGoodnessOfFit
ValueType getGoodnessOfFit() const
Returns the goodness of fit .
Definition: MLRModel.hpp:467
CDPL::Math::CommonType
Definition: CommonType.hpp:41
CDPL::Math::MLRModel::addXYData
void addXYData(const VectorExpression< V > &x_vars, ValueType y)
Adds a new data point to the current data set.
Definition: MLRModel.hpp:346
CDPL::Math::MLRModel::getXMatrix
MatrixType & getXMatrix()
Returns a matrix where each row represents the vector with independent variables of the currently st...
Definition: MLRModel.hpp:366
CDPL::Math::gammaQ
T gammaQ(const T &a, const T &x)
Computes the incomplete gamma function (see [NRIC] for details).
CDPL::Pharm::FeatureProperty::TOLERANCE
CDPL_PHARM_API const Base::LookupKey TOLERANCE
TypeTraits.hpp
Definition of type traits.
CDPL::Math::MLRModel::getYValues
VectorType & getYValues()
Returns a vector containing the dependent variables of the currently stored data points .
Definition: MLRModel.hpp:380
CDPL::Math::MLRModel::operator()
ValueType operator()(const VectorExpression< V > &x_vars) const
Predicts the value of the dependent variable for a vector of independent variables given by x_vars.
CDPL::Math::MLRModel::getStandardDeviation
ValueType getStandardDeviation() const
Returns the standard deviation of the residuals .
Definition: MLRModel.hpp:481
CDPL::Chem::CIPDescriptor::m
const unsigned int m
Specifies that the stereocenter has m configuration.
Definition: CIPDescriptor.hpp:116
SVDecomposition.hpp
Implementation of matrix singular value decomposition and associated operations.
CDPL::Base::CalculationFailed
Thrown to indicate that some requested calculation has failed.
Definition: Base/Exceptions.hpp:230
CDPL::Chem::AtomType::T
const unsigned int T
Specifies Hydrogen (Tritium).
Definition: AtomType.hpp:67
Exceptions.hpp
Definition of exception classes.
CDPL::Chem::CIPDescriptor::r
const unsigned int r
Specifies that the stereocenter has r configuration.
Definition: CIPDescriptor.hpp:76
CDPL::Math::MLRModel::getChiSquare
ValueType getChiSquare() const
Returns the sum of squared residuals .
Definition: MLRModel.hpp:460
CDPL::Math::Matrix
Definition: Matrix.hpp:280
SpecialFunctions.hpp
Provides miscellaneous special mathematical functions.
CDPL
The namespace of the Chemical Data Processing Library.
CDPL::Math::MLRModel::ValueType
T ValueType
Definition: MLRModel.hpp:84
CDPL::Math::MLRModel::buildModel
void buildModel()
Performs linear least squares regression modeling of the set of currently stored data points .
Definition: MLRModel.hpp:393
CDPL::Math::row
MatrixRow< M > row(MatrixExpression< M > &e, typename MatrixRow< M >::SizeType i)
Definition: MatrixProxy.hpp:716
Matrix.hpp
Definition of matrix data types.
CDPL::Math::MLRModel::calcStatistics
void calcStatistics()
Calculates various statistical parameters describing the built regression model.
Definition: MLRModel.hpp:487
CDPL::Math::svSubstitute
void svSubstitute(const MatrixExpression< U > &u, const VectorExpression< W > &w, const MatrixExpression< V > &v, const VectorExpression< B > &b, VectorExpression< X > &x)
Solves for a vector where is given by its Singular Value Decomposition [WSVD].
Definition: SVDecomposition.hpp:466
CDPL::Math::svDecompose
bool svDecompose(MatrixExpression< A > &a, VectorExpression< W > &w, MatrixExpression< V > &v, std::size_t max_iter=0)
Computes the Singular Value Decomposition [WSVD] of a -dimensional matrix a.
Definition: SVDecomposition.hpp:70
Vector.hpp
Definition of vector data types.