Chemical Data Processing Library C++ API - Version 1.4.0
ChEMBLStandardizer.hpp
Go to the documentation of this file.
1 /*
2  * ChEMBLStandardizer.hpp
3  *
4  * Implementation of the ChEMBL molecule standardization and parent compound extraction procedure
5  * (A. P. Bento et al., An open source chemical structure curation pipeline using RDKit, J. Cheminformatics 2020, 12, 51)
6  *
7  * This file is part of the Chemical Data Processing Toolkit
8  *
9  * Copyright (C) 2003 Thomas Seidel <thomas.seidel@univie.ac.at>
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * along with this library; see the file COPYING. If not, write to
23  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26 
32 #ifndef CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
33 #define CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
34 
35 #include <vector>
36 #include <cstdint>
37 #include <utility>
38 #include <unordered_set>
39 #include <memory>
40 
41 #include <boost/functional/hash.hpp>
42 
43 #include "CDPL/Chem/APIPrefix.hpp"
45 #include "CDPL/Chem/Fragment.hpp"
50 #include "CDPL/Util/BitSet.hpp"
52 
53 
54 namespace CDPL
55 {
56 
57  namespace Chem
58  {
59 
65  {
66 
67  public:
// Reference-counted smart pointer type (std::shared_ptr) for dynamically allocated ChEMBLStandardizer instances.
69  typedef std::shared_ptr<ChEMBLStandardizer> SharedPointer;
70 
// ChangeFlags enumerators: bitwise-OR-combinable flags reporting which standardization
// steps modified the input molecule. NONE is zero; every other enumerator occupies a
// distinct single bit (0x1 through 0x800), so a returned value can be probed with a bitwise AND.
75  {
76 
78  NONE = 0x0,
80  EXCLUDED = 0x1,
82  EXPLICIT_HYDROGENS_REMOVED = 0x2,
84  UNKNOWN_STEREO_STANDARDIZED = 0x4,
86  BONDS_KEKULIZED = 0x8,
88  STRUCTURE_NORMALIZED = 0x10,
90  CHARGES_REMOVED = 0x20,
92  TARTRATE_STEREO_CLEARED = 0x40,
94  STRUCTURE_2D_CORRECTED = 0x80,
96  ISOTOPE_INFO_CLEARED = 0x100,
98  SALT_COMPONENTS_REMOVED = 0x200,
100  SOLVENT_COMPONENTS_REMOVED = 0x400,
102  DUPLICATE_COMPONENTS_REMOVED = 0x800
103  };
104 
109 
115 
// Standardizes mol in place.
// Returns ChangeFlags bits reporting which standardization steps modified the molecule.
// NOTE(review): proc_excld (default false) presumably controls whether molecules that meet
// the exclusion criteria (EXCLUDED flag) are processed anyway - confirm against the implementation.
122  ChangeFlags standardize(Molecule& mol, bool proc_excld = false);
123 
// Writes a standardized copy of molgraph to std_mol; molgraph is taken by const reference.
// Returns ChangeFlags bits reporting which standardization steps changed the structure.
// NOTE(review): proc_excluded (default false) presumably controls whether molecular graphs that
// meet the exclusion criteria (EXCLUDED flag) are processed anyway - confirm against the implementation.
131  ChangeFlags standardize(const MolecularGraph& molgraph, Molecule& std_mol, bool proc_excluded = false);
132 
// Extracts the parent compound of mol in place (removing salt/solvent components).
// Returns ChangeFlags bits reporting which steps modified the molecule.
// NOTE(review): neutralize (default true) presumably requests charge removal on the parent, and
// check_exclusion (default true) presumably enables the exclusion test first - confirm both
// against the implementation.
140  ChangeFlags getParent(Molecule& mol, bool neutralize = true, bool check_exclusion = true);
141 
// Extracts the parent compound of molgraph into parent_mol; molgraph is taken by const reference.
// Returns ChangeFlags bits reporting which steps changed the structure.
// NOTE(review): neutralize (default true) presumably requests charge removal on the parent, and
// check_exclusion (default true) presumably enables the exclusion test first - confirm both
// against the implementation.
150  ChangeFlags getParent(const MolecularGraph& molgraph, Molecule& parent_mol, bool neutralize = true, bool check_exclusion = true);
151 
158 
// Private section: helper declarations, internal typedefs and working data members.
// Only declarations are visible in this header; the definitions live elsewhere.
159  private:
// Sequence of raw Atom pointers.
160  typedef std::vector<Atom*> AtomList;
161 
162  void copyMolecularGraph(const MolecularGraph& molgraph, Molecule& mol_copy) const;
163 
// NOTE(review): the bool-returning helpers below presumably report whether their step applied
// (feeding the corresponding ChangeFlags bit) - confirm against the implementation.
// The second overload additionally writes to the boron_cnt output reference.
164  bool checkExclusionCriterions(const Molecule& mol) const;
165  bool checkExclusionCriterions(const MolecularGraph& molgraph, std::size_t& boron_cnt) const;
166 
167  bool standardizeUnknownStereochemistry(Molecule& mol) const;
168 
// Member helper taking a Molecule and returning bool; not the same entity as the free function
// Chem::kekulizeBonds(MolecularGraph&), which returns void.
169  bool kekulizeBonds(Molecule& mol);
170 
171  bool removeExplicitHydrogens(Molecule& mol) const;
172  bool isRemovableHydrogen(const Atom& atom) const;
173 
174  bool normalizeStructure(Molecule& mol);
// Returns a pointer to a const Atom of ptn. NOTE(review): presumably null when no atom
// carries the given mapping id - confirm against the implementation.
175  const Chem::Atom* getAtomWithMappingID(const Molecule& ptn, std::size_t id) const;
176 
177  bool removeCharges(Molecule& mol);
178 
179  bool removeTartrateStereochemistry(Molecule& mol);
180 
181  bool cleanup2DStructure(Molecule& mol);
// Both 2D helpers are non-const members even though mol is passed by const reference.
// NOTE(review): presumably they operate on the atom2DCoords member - confirm; angle units are
// not visible from this header.
182  double calc2DBondAngle(const Molecule& mol, const Atom& ctr_atom, const Atom& nbr_atom1, const Atom& nbr_atom2);
183  void rotateSubstituent(const Molecule& mol, const Atom& ctr_atom, const Atom& subst_atom, double rot_ang);
184 
// Member helper operating on a whole Molecule; not the same entity as the free function
// Chem::clearMatchConstraints(Atom&).
185  void clearMatchConstraints(Molecule& mol) const;
186 
// StructureID: pair of two 64-bit unsigned integers.
// NOTE(review): presumably hash codes produced by hashCodeCalc - confirm.
187  typedef std::pair<std::uint64_t, std::uint64_t> StructureID;
// MoleculeComponent: a pointer to a const Fragment paired with a StructureID.
188  typedef std::pair<const Fragment*, StructureID> MoleculeComponent;
189  typedef std::vector<MoleculeComponent> MoleculeComponentList;
// StructureIDSet: unordered set of StructureID values hashed with boost::hash.
190  typedef std::unordered_set<StructureID, boost::hash<StructureID> > StructureIDSet;
191 
// Working objects held by value as data members.
// NOTE(review): presumably kept as members so they can be reused across calls, which would
// also explain why several helpers above are non-const - confirm against the implementation.
192  HashCodeCalculator hashCodeCalc;
193  KekuleStructureCalculator kekuleStructureCalc;
194  Util::STArray kekulizedBondOrders;
195  SubstructureSearch substructSearch;
196  ProtonationStateStandardizer chargeStandardizer;
197  Math::Vector2DArray atom2DCoords;
198  Util::BitSet markedAtomSet;
199  Fragment tmpFragment;
200  BasicMolecule tmpMolecule;
201  MoleculeComponentList molCompList1;
202  MoleculeComponentList molCompList2;
203  StructureIDSet uniqueMolComps;
204  };
205  } // namespace Chem
206 } // namespace CDPL
207 
208 #endif // CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
Definition of class CDPL::Chem::BasicMolecule.
Declaration of type CDPL::Util::BitSet.
Definition of the preprocessor macro CDPL_CHEM_API.
#define CDPL_CHEM_API
Tells the compiler/linker which classes, functions and variables are part of the library API.
Definition of class CDPL::Chem::Fragment.
Definition of class CDPL::Chem::HashCodeCalculator.
Definition of class CDPL::Chem::KekuleStructureCalculator.
Definition of class CDPL::Chem::ProtonationStateStandardizer.
Definition of class CDPL::Chem::SubstructureSearch.
Definition of class CDPL::Math::VectorArray.
Abstract base class representing a chemical atom and its bonded neighborhood.
Definition: Atom.hpp:57
Concrete Chem::Molecule implementation that owns Chem::BasicAtom and Chem::BasicBond instances.
Definition: BasicMolecule.hpp:60
Implementation of the ChEMBL structure preprocessing pipeline.
Definition: ChEMBLStandardizer.hpp:65
ChangeFlags getParent(const MolecularGraph &molgraph, Molecule &parent_mol, bool neutralize=true, bool check_exclusion=true)
Extracts the parent compound of molgraph into parent_mol.
ChangeFlags standardize(Molecule &mol, bool proc_excld=false)
Standardizes mol in place.
ChangeFlags
Bitwise-OR-combined flags reporting which standardization steps modified the input molecule.
Definition: ChEMBLStandardizer.hpp:75
ChangeFlags standardize(const MolecularGraph &molgraph, Molecule &std_mol, bool proc_excluded=false)
Writes a standardized copy of molgraph to std_mol.
ChEMBLStandardizer()
Constructs the ChEMBLStandardizer instance.
std::shared_ptr< ChEMBLStandardizer > SharedPointer
A reference-counted smart pointer [SHPTR] for dynamically allocated ChEMBLStandardizer instances.
Definition: ChEMBLStandardizer.hpp:69
ChEMBLStandardizer & operator=(const ChEMBLStandardizer &standardizer)
Replaces the state of this standardizer by a copy of the state of standardizer.
ChEMBLStandardizer(const ChEMBLStandardizer &standardizer)
Constructs a copy of the ChEMBLStandardizer instance standardizer.
ChangeFlags getParent(Molecule &mol, bool neutralize=true, bool check_exclusion=true)
Extracts the parent compound of mol in place (removing salt/solvent components).
Concrete Chem::MolecularGraph implementation that stores references to a selectable subset of atoms a...
Definition: Fragment.hpp:57
Computes a 64-bit hash code that identifies a molecular graph up to a configurable set of atom and bo...
Definition: HashCodeCalculator.hpp:67
Assigns an alternating single/double bond pattern (Kekulé structure) to the previously undefined bond...
Definition: KekuleStructureCalculator.hpp:55
Abstract base class for representations of a chemical structure as a graph of bonded atoms.
Definition: MolecularGraph.hpp:57
Abstract base class representing a mutable molecular graph that owns its atoms and bonds.
Definition: Molecule.hpp:53
Adjusts the protonation state of a molecule (atom formal charges and bonded hydrogen counts) accordin...
Definition: ProtonationStateStandardizer.hpp:58
Subgraph-isomorphism search of a query molecular graph against a target molecular graph,...
Definition: SubstructureSearch.hpp:74
constexpr unsigned int NONE
Represents an empty set of atom properties.
Definition: Biomol/AtomPropertyFlag.hpp:48
CDPL_CHEM_API void kekulizeBonds(MolecularGraph &molgraph)
Assigns Kekulé bond orders to the aromatic bonds of molgraph.
CDPL_CHEM_API void clearMatchConstraints(Atom &atom)
Removes the Chem::AtomProperty::MATCH_CONSTRAINTS property from atom.
VectorArray< Vector2D > Vector2DArray
Array storing vectors of type Math::Vector2D.
Definition: VectorArray.hpp:80
Array< std::size_t > STArray
Array storing unsigned integers of type std::size_t.
Definition: Array.hpp:575
boost::dynamic_bitset BitSet
Dynamic bitset class.
Definition: BitSet.hpp:46
The namespace of the Chemical Data Processing Library.