Chemical Data Processing Library C++ API - Version 1.2.0
ChEMBLStandardizer.hpp
Go to the documentation of this file.
1 /*
2  * ChEMBLStandardizer.hpp
3  *
4  * Implementation of the ChEMBL molecule standardization and parent compound extraction procedure
5  * (A. P. Bento et al., An open source chemical structure curation pipeline using RDKit, J. Cheminformatics 2020, 12, 51)
6  *
7  * This file is part of the Chemical Data Processing Toolkit
8  *
9  * Copyright (C) 2003 Thomas Seidel <thomas.seidel@univie.ac.at>
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * along with this library; see the file COPYING. If not, write to
23  * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26 
32 #ifndef CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
33 #define CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
34 
35 #include <vector>
36 #include <cstdint>
37 #include <utility>
38 #include <unordered_set>
39 #include <memory>
40 
41 #include <boost/functional/hash.hpp>
42 
43 #include "CDPL/Chem/APIPrefix.hpp"
45 #include "CDPL/Chem/Fragment.hpp"
50 #include "CDPL/Util/BitSet.hpp"
52 
53 
54 namespace CDPL
55 {
56 
57  namespace Chem
58  {
59 
// Body of class ChEMBLStandardizer: ChEMBL molecule standardization and parent
// compound extraction (see the literature reference in the file header).
// NOTE(review): the class header line itself is not present in this listing
// extract; several other listing lines (e.g. 70, 88, 100) are absent as well.
65  {
66 
67  public:
    // Shared-ownership handle type for standardizer instances.
68  typedef std::shared_ptr<ChEMBLStandardizer> SharedPointer;
69 
    // Enumerators of the result type returned by standardize()/getParent()
    // (presumably 'enum ChangeFlags' - its header line is missing here, TODO confirm).
    // Every non-zero enumerator occupies a distinct single bit, so values can be
    // combined and tested bitwise.
71  {
72 
73  NONE = 0x0,
74  EXCLUDED = 0x1,
75  EXPLICIT_HYDROGENS_REMOVED = 0x2,
76  UNKNOWN_STEREO_STANDARDIZED = 0x4,
77  BONDS_KEKULIZED = 0x8,
78  STRUCTURE_NORMALIZED = 0x10,
79  CHARGES_REMOVED = 0x20,
80  TARTRATE_STEREO_CLEARED = 0x40,
81  STRUCTURE_2D_CORRECTED = 0x80,
82  ISOTOPE_INFO_CLEARED = 0x100,
83  SALT_COMPONENTS_REMOVED = 0x200,
84  SOLVENT_COMPONENTS_REMOVED = 0x400,
85  DUPLICATE_COMPONENTS_REMOVED = 0x800
86  };
87 
89 
    // Copy constructor.
90  ChEMBLStandardizer(const ChEMBLStandardizer& standardizer);
91 
    // Standardization, overload taking only a mutable molecule
    // (presumably modified in place - TODO confirm against the implementation).
92  ChangeFlags standardize(Molecule& mol, bool proc_excld = false);
93 
    // Standardization, overload taking a read-only input molecule and a separate
    // output molecule 'std_mol'.
    // NOTE(review): the flag parameter is spelled 'proc_excluded' here but
    // 'proc_excld' in the overload above; presumably the same option.
94  ChangeFlags standardize(const Molecule& mol, Molecule& std_mol, bool proc_excluded = false);
95 
    // Parent compound extraction, overload taking only a mutable molecule.
    // Both optional flags default to true.
96  ChangeFlags getParent(Molecule& mol, bool neutralize = true, bool check_exclusion = true);
97 
    // Parent compound extraction, overload taking a read-only input molecule and
    // a separate output molecule 'parent_mol'.
98  ChangeFlags getParent(const Molecule& mol, Molecule& parent_mol, bool neutralize = true, bool check_exclusion = true);
99 
101 
102  private:
103  typedef std::vector<Atom*> AtomList;
104 
105  void copyMolecule(const Molecule& mol, Molecule& mol_copy) const;
106 
    // Exclusion checks; the second overload additionally takes a boron counter
    // by non-const reference (presumably updated by the call - TODO confirm).
107  bool checkExclusionCriterions(const Molecule& mol) const;
108  bool checkExclusionCriterions(const MolecularGraph& molgraph, std::size_t& boron_cnt) const;
109 
    // Private processing helpers. The names mirror the ChangeFlags enumerators;
    // the bool results presumably report whether the molecule was changed - TODO confirm.
110  bool standardizeUnknownStereochemistry(Molecule& mol) const;
111 
112  bool kekulizeBonds(Molecule& mol);
113 
114  bool removeExplicitHydrogens(Molecule& mol) const;
115  bool isRemovableHydrogen(const Atom& atom) const;
116 
117  bool normalizeStructure(Molecule& mol);
118  const Chem::Atom* getAtomWithMappingID(const Molecule& ptn, std::size_t id) const;
119 
120  bool removeCharges(Molecule& mol);
121 
122  bool removeTartrateStereochemistry(Molecule& mol);
123 
124  bool cleanup2DStructure(Molecule& mol);
125  double calc2DBondAngle(const Molecule& mol, const Atom& ctr_atom, const Atom& nbr_atom1, const Atom& nbr_atom2);
126  void rotateSubstituent(const Molecule& mol, const Atom& ctr_atom, const Atom& subst_atom, double rot_ang);
127 
128  void clearMatchConstraints(Molecule& mol) const;
129 
    // A structure identifier is a pair of 64-bit unsigned values; a molecule
    // component pairs a non-owning fragment pointer with such an identifier.
    // The identifier set is hashed with boost::hash.
130  typedef std::pair<std::uint64_t, std::uint64_t> StructureID;
131  typedef std::pair<const Fragment*, StructureID> MoleculeComponent;
132  typedef std::vector<MoleculeComponent> MoleculeComponentList;
133  typedef std::unordered_set<StructureID, boost::hash<StructureID> > StructureIDSet;
134 
    // Member objects held per instance. Several look like reusable working
    // storage for the non-const helpers above (e.g. tmpFragment, tmpMolecule,
    // molCompList1/2) - TODO confirm against the implementation file.
135  HashCodeCalculator hashCodeCalc;
136  KekuleStructureCalculator kekuleStructureCalc;
137  Util::STArray kekulizedBondOrders;
138  SubstructureSearch substructSearch;
139  ProtonationStateStandardizer chargeStandardizer;
140  Math::Vector2DArray atom2DCoords;
141  Util::BitSet markedAtomSet;
142  Fragment tmpFragment;
143  BasicMolecule tmpMolecule;
144  MoleculeComponentList molCompList1;
145  MoleculeComponentList molCompList2;
146  StructureIDSet uniqueMolComps;
147  };
148  } // namespace Chem
149 } // namespace CDPL
150 
151 #endif // CDPL_CHEM_CHEMBLSTANDARDIZER_HPP
Definition of the class CDPL::Chem::BasicMolecule.
Definition of the type CDPL::Util::BitSet.
Definition of the preprocessor macro CDPL_CHEM_API.
#define CDPL_CHEM_API
Tells the compiler/linker which classes, functions and variables are part of the library API.
Definition of the class CDPL::Chem::Fragment.
Definition of the class CDPL::Chem::HashCodeCalculator.
Definition of the class CDPL::Chem::KekuleStructureCalculator.
Definition of the class CDPL::Chem::ProtonationStateStandardizer.
Definition of the class CDPL::Chem::SubstructureSearch.
Definition of the class CDPL::Math::VectorArray.
Atom.
Definition: Atom.hpp:52
BasicMolecule.
Definition: BasicMolecule.hpp:54
Implementation of the ChEMBL structure preprocessing pipeline.
Definition: ChEMBLStandardizer.hpp:65
ChangeFlags standardize(Molecule &mol, bool proc_excld=false)
ChangeFlags getParent(const Molecule &mol, Molecule &parent_mol, bool neutralize=true, bool check_exclusion=true)
ChangeFlags
Definition: ChEMBLStandardizer.hpp:71
ChangeFlags standardize(const Molecule &mol, Molecule &std_mol, bool proc_excluded=false)
std::shared_ptr< ChEMBLStandardizer > SharedPointer
Definition: ChEMBLStandardizer.hpp:68
ChEMBLStandardizer & operator=(const ChEMBLStandardizer &standardizer)
ChEMBLStandardizer(const ChEMBLStandardizer &standardizer)
ChangeFlags getParent(Molecule &mol, bool neutralize=true, bool check_exclusion=true)
Fragment.
Definition: Fragment.hpp:52
HashCodeCalculator.
Definition: HashCodeCalculator.hpp:57
KekuleStructureCalculator.
Definition: KekuleStructureCalculator.hpp:54
MolecularGraph.
Definition: MolecularGraph.hpp:52
Molecule.
Definition: Molecule.hpp:49
Sets the protonation state of molecules according to desired objectives.
Definition: ProtonationStateStandardizer.hpp:57
SubstructureSearch.
Definition: SubstructureSearch.hpp:64
constexpr unsigned int NONE
Represents an empty set of atom properties.
Definition: Biomol/AtomPropertyFlag.hpp:48
CDPL_CHEM_API void kekulizeBonds(MolecularGraph &molgraph)
CDPL_CHEM_API void clearMatchConstraints(Atom &atom)
VectorArray< Vector2D > Vector2DArray
An array of Math::Vector2D objects.
Definition: VectorArray.hpp:79
Array< std::size_t > STArray
An array of unsigned integers of type std::size_t.
Definition: Array.hpp:567
boost::dynamic_bitset BitSet
A dynamic bitset class.
Definition: BitSet.hpp:46
The namespace of the Chemical Data Processing Library.