1import sys
2import os
3import argparse
4import distutils.util
5
6import CDPL.Chem as Chem
7
8
9# performs ChEMBL molecule standardization and parent structure extraction (optional)
10# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
    """Standardize in_mol into out_mol and optionally reduce out_mol to its parent structure.

    chembl_proc -- standardizer instance doing the actual work
    in_mol      -- molecule as read from the input
    out_mol     -- molecule receiving the standardized structure
    args        -- parsed command line options; 'proc_excluded' and 'extract_parent' are read

    Returns the change flags reported by the standardizer; if parent extraction
    is enabled the EXCLUDED bit of the standardization step is cleared and the
    flags reported by the parent extraction are merged in.
    """
    # the standardization works on a separate output molecule
    # (passing only a single molecule instance would modify that instance in-place)
    flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)

    # nothing more to do if parent structure extraction was not requested
    if not args.extract_parent:
        return flags

    # forget the excluded flag possibly set by the standardization procedure,
    # its state might change after salt stripping
    flags &= ~Chem.ChEMBLStandardizer.EXCLUDED

    # extract the parent structure (in-place) and record the carried out modifications
    flags |= chembl_proc.getParent(out_mol)

    return flags
23
def _parseBool(value: str) -> bool:
    """Convert a command line truth value string to a bool.

    Local replacement for distutils.util.strtobool (distutils is deprecated
    by PEP 632 and no longer shipped with Python 3.12+). The same spellings
    are accepted, case-insensitively:
    y/yes/t/true/on/1 -> True and n/no/f/false/off/0 -> False.

    Raises argparse.ArgumentTypeError for anything else so that argparse
    reports a readable error message for the offending option.
    """
    normalized = value.lower()

    if normalized in ('y', 'yes', 't', 'true', 'on', '1'):
        return True

    if normalized in ('n', 'no', 'f', 'false', 'off', '0'):
        return False

    raise argparse.ArgumentTypeError('invalid truth value %r' % value)


def parseArgs() -> argparse.Namespace:
    """Define the command line interface and parse the program arguments.

    Returns the namespace produced by argparse with the attributes
    in_file, out_file, verb_level, extract_parent, drop_excluded and
    proc_excluded. Invalid or missing arguments make argparse print an
    error message and terminate the program.
    """
    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')

    parser.add_argument('-i',
                        dest='in_file',
                        required=True,
                        metavar='<file>',
                        help='Input molecule file')
    parser.add_argument('-o',
                        dest='out_file',
                        required=True,
                        metavar='<file>',
                        help='Output molecule file')
    parser.add_argument('-v',
                        dest='verb_level',
                        required=False,
                        metavar='<0|1|2>',
                        choices=range(0, 3),
                        default=1,
                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
                        type=int)
    parser.add_argument('-p',
                        dest='extract_parent',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=True,
                        help='Extract parent structure (default: true)')
    parser.add_argument('-d',
                        dest='drop_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=False,
                        help='Drop structures that fulfill the exclusion criterions (default: false)')
    parser.add_argument('-x',
                        dest='proc_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=True,
                        help='Standardize structures that fulfill the exclusion criterions (default: true)')

    return parser.parse_args()
68
def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
    """Return a multi-line, human readable listing of the modifications encoded in change_flags.

    The result always starts with a header line; one bullet line is appended
    for every known modification flag that is set in change_flags.
    """
    std = Chem.ChEMBLStandardizer

    # (flag, bullet line) pairs in the order in which they are reported
    flag_texts = (
        (std.EXPLICIT_HYDROGENS_REMOVED, '\n * Explicit hydrogens removed'),
        (std.UNKNOWN_STEREO_STANDARDIZED, '\n * Undefined stereocenter information standardized'),
        (std.BONDS_KEKULIZED, '\n * Kekule structure generated'),
        (std.STRUCTURE_NORMALIZED, '\n * Functional groups normalized'),
        (std.CHARGES_REMOVED, '\n * Number of charged atoms reduced'),
        (std.TARTRATE_STEREO_CLEARED, '\n * Configuration of chiral tartrate atoms set to undefined'),
        (std.STRUCTURE_2D_CORRECTED, '\n * 2D structure corrected'),
        (std.ISOTOPE_INFO_CLEARED, '\n * Isotope information cleared'),
        (std.SALT_COMPONENTS_REMOVED, '\n * Salt components removed'),
        (std.SOLVENT_COMPONENTS_REMOVED, '\n * Solvent components removed'),
        (std.DUPLICATE_COMPONENTS_REMOVED, '\n * Duplicate components removed'),
    )

    header = ' Carried out modifications:'

    return header + ''.join(text for flag, text in flag_texts if change_flags & flag)
106
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """Compose the console message describing what happened to the molecule mol_id.

    change_flags -- flags returned by the standardization of the molecule
    args         -- parsed command line options; 'verb_level', 'drop_excluded'
                    and 'proc_excluded' are read
    mol_id       -- identifier of the molecule used in the message

    Returns None if the verbosity level is 0 (no console output), otherwise the
    message text. At verbosity level 2 messages about modified molecules also
    list the carried out modifications.
    """
    if args.verb_level == 0:
        return None

    list_changes = (args.verb_level == 2)
    excluded_flag = Chem.ChEMBLStandardizer.EXCLUDED

    # molecules that are not flagged as excluded
    if not (change_flags & excluded_flag):
        if not change_flags:
            return ('- Molecule %s: forwarded unchanged' % mol_id)

        if list_changes:
            return ('- Molecule %s: modified\n%s' % (mol_id, getListOfChangesString(change_flags)))

        return ('- Molecule %s: modified' % mol_id)

    # molecules flagged as excluded
    if args.drop_excluded:
        return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

    # not standardized at all, or no flag other than the excluded flag is set
    if not args.proc_excluded or not (change_flags & ~excluded_flag):
        return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)

    if list_changes:
        return ('- Molecule %s: modified (flagged as excluded)\n%s' % (mol_id, getListOfChangesString(change_flags)))

    return ('- Molecule %s: modified (flagged as excluded)' % mol_id)
133
def main() -> None:
    """Read molecules from the input file, standardize them and write the results.

    Every molecule read from args.in_file is passed through standardize(), a
    log message is printed according to the verbosity level and, unless the
    molecule is flagged as excluded and dropping was requested, the result is
    written to args.out_file. Any read, processing or write error terminates
    the program with an error message; on success the exit status is 0.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1 # 1-based index of the molecule currently being processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i) # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # check if the excluded flag has been set and the molecule has to be dropped;
                # a flag is used instead of 'continue' so that the molecule counter below
                # is advanced for dropped molecules as well
                dropped = (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded

                if not dropped:
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule; no exception object is available here,
                        # thus the message carries the molecule identifier only
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e: # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    writer.close()
    sys.exit(0)
205
# run the standardization workflow only if this file is executed as a script
if __name__ == '__main__':
    main()