1import sys
2import argparse
3import distutils.util
4
5import CDPL.Chem as Chem
6
7
8# performs ChEMBL molecule standardization and parent structure extraction (optional)
9# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
    """
    Standardizes a molecule and optionally reduces it to its parent structure.

    The standardized structure is written to out_mol, so in_mol (the molecule as
    read from the input) is not the instance that receives the result. The value
    of args.proc_excluded is forwarded to the standardizer as third argument.

    If args.extract_parent is set, the parent structure of out_mol is extracted
    in-place afterwards and the flags reported by that step are merged into the
    result.

    Returns the combined flags describing the carried out modifications.
    """
    excluded_flag = Chem.ChEMBLStandardizer.EXCLUDED
    flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)

    # nothing more to do if parent structure extraction was not requested
    if not args.extract_parent:
        return flags

    # The excluded flag possibly set by the standardization is dropped first since
    # the verdict might change after salt stripping; getParent() then contributes
    # its own flags for the modifications it carried out on out_mol.
    return (flags & ~excluded_flag) | chembl_proc.getParent(out_mol)
22
def _parseBool(value: str) -> bool:
    """
    Converts the textual truth value of a command line option to a bool.

    Accepts (case-insensitively) y, yes, t, true, on, 1 as True and
    n, no, f, false, off, 0 as False, i.e. the same spellings that
    distutils.util.strtobool() understands. The conversion is implemented
    locally because distutils is deprecated (PEP 632) and no longer part of
    the standard library as of Python 3.12.

    Raises argparse.ArgumentTypeError for any other value so that argparse
    reports a readable usage error.
    """
    normalized = value.lower()

    if normalized in ('y', 'yes', 't', 'true', 'on', '1'):
        return True

    if normalized in ('n', 'no', 'f', 'false', 'off', '0'):
        return False

    raise argparse.ArgumentTypeError('invalid truth value %r (expected true or false)' % value)


def parseArgs() -> argparse.Namespace:
    """
    Parses the command line arguments of the script (taken from sys.argv).

    Options:
      -i <file>        input molecule file (required)
      -o <file>        output molecule file (required)
      -v <0|1|2>       verbosity level (default: 1)
      -p <true|false>  extract parent structure (default: true)
      -d <true|false>  drop structures fulfilling the exclusion criteria (default: false)
      -x <true|false>  standardize structures fulfilling the exclusion criteria (default: true)

    Returns the populated argparse.Namespace with the attributes in_file, out_file,
    verb_level, extract_parent, drop_excluded and proc_excluded. Invalid or missing
    arguments make argparse print a usage message and terminate the program.
    """
    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')

    parser.add_argument('-i',
                        dest='in_file',
                        required=True,
                        metavar='<file>',
                        help='Input molecule file')
    parser.add_argument('-o',
                        dest='out_file',
                        required=True,
                        metavar='<file>',
                        help='Output molecule file')
    parser.add_argument('-v',
                        dest='verb_level',
                        required=False,
                        metavar='<0|1|2>',
                        choices=range(0, 3),
                        default=1,
                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
                        type=int)
    parser.add_argument('-p',
                        dest='extract_parent',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=True,
                        help='Extract parent structure (default: true)')
    parser.add_argument('-d',
                        dest='drop_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=False,
                        help='Drop structures that fulfill the exclusion criterions (default: false)')
    parser.add_argument('-x',
                        dest='proc_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=_parseBool,
                        default=True,
                        help='Standardize structures that fulfill the exclusion criterions (default: true)')

    return parser.parse_args()
67
def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
    """
    Builds a multi-line, human readable listing of the modifications encoded in change_flags.

    The result always starts with a header line; for every modification flag set in
    change_flags one bullet line is appended (in the fixed order of the table below).
    """
    std = Chem.ChEMBLStandardizer

    # modification flag -> text appended to the listing when the flag is set
    change_texts = (
        (std.EXPLICIT_HYDROGENS_REMOVED, '\n * Explicit hydrogens removed'),
        (std.UNKNOWN_STEREO_STANDARDIZED, '\n * Undefined stereocenter information standardized'),
        (std.BONDS_KEKULIZED, '\n * Kekule structure generated'),
        (std.STRUCTURE_NORMALIZED, '\n * Functional groups normalized'),
        (std.CHARGES_REMOVED, '\n * Number of charged atoms reduced'),
        (std.TARTRATE_STEREO_CLEARED, '\n * Configuration of chiral tartrate atoms set to undefined'),
        (std.STRUCTURE_2D_CORRECTED, '\n * 2D structure corrected'),
        (std.ISOTOPE_INFO_CLEARED, '\n * Isotope information cleared'),
        (std.SALT_COMPONENTS_REMOVED, '\n * Salt components removed'),
        (std.SOLVENT_COMPONENTS_REMOVED, '\n * Solvent components removed'),
        (std.DUPLICATE_COMPONENTS_REMOVED, '\n * Duplicate components removed'),
    )

    parts = [' Carried out modifications:']
    parts.extend(text for flag, text in change_texts if change_flags & flag)

    return ''.join(parts)
105
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """
    Composes the console message describing what happened to the molecule mol_id.

    Returns None if args.verb_level is 0 (no console output). Otherwise the
    message states whether the molecule was discarded, forwarded unchanged or
    modified, and whether it was flagged as excluded. At verbosity level 2 the
    list of carried out modifications is appended to 'modified' messages.
    """
    if args.verb_level == 0:
        return None

    excluded_flag = Chem.ChEMBLStandardizer.EXCLUDED

    if change_flags & excluded_flag:
        # excluded molecules that get dropped are reported as such, whatever else happened
        if args.drop_excluded:
            return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

        unchanged_msg = '- Molecule %s: forwarded unchanged (flagged as excluded)'
        modified_msg = '- Molecule %s: modified (flagged as excluded)'
        # only counts as modified if excluded molecules are processed at all
        # and any flag other than the excluded flag is set
        was_modified = args.proc_excluded and bool(change_flags & ~excluded_flag)
    else:
        unchanged_msg = '- Molecule %s: forwarded unchanged'
        modified_msg = '- Molecule %s: modified'
        was_modified = bool(change_flags)

    if not was_modified:
        return (unchanged_msg % mol_id)

    if args.verb_level != 2:
        return (modified_msg % mol_id)

    # extra verbose: append the detailed list of modifications
    return ((modified_msg + '\n%s') % (mol_id, getListOfChangesString(change_flags)))
132
def _writeMolecule(writer: Chem.MolecularGraphWriter, out_mol: Chem.Molecule, mol_id: str) -> None:
    """
    Prepares out_mol for output and writes it using the given writer.

    Terminates the program via sys.exit() with an error message if the writer
    reports a failure or if an exception is raised while preparing/writing.
    """
    try:
        # calculate (if not present) some basic properties of the output molecule
        # that might be required for writing (output format dependent)
        Chem.calcImplicitHydrogenCounts(out_mol, False)
        Chem.perceiveHybridizationStates(out_mol, False)
        Chem.perceiveSSSR(out_mol, False)
        Chem.setRingFlags(out_mol, False)
        Chem.setAromaticityFlags(out_mol, False)
        Chem.perceiveComponents(out_mol, False)

        # write output molecule
        written = writer.write(out_mol)

    except Exception as e: # handle exception raised in case of severe write errors
        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

    # a False return value carries no exception, so there is no error detail to report
    if not written:
        sys.exit('Error: writing molecule %s failed' % mol_id)


def main() -> None:
    """
    Script entry point.

    Reads the molecules of the input file one after the other, standardizes each
    (optionally extracting the parent structure), prints a log message depending
    on the verbosity level and writes the result to the output file unless the
    molecule was flagged as excluded and dropping of such molecules was requested.

    The program terminates via sys.exit(): with status 0 on success and with an
    error message on read, processing or write failures. The writer gets closed
    in either case.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1 # 1-based index of the molecule currently being read/processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i) # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # molecules flagged as excluded are skipped on request; a guard is used
                # instead of 'continue' so that the molecule counter below is always advanced
                if not ((change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded):
                    _writeMolecule(writer, out_mol, mol_id)

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    finally:
        # also executed when sys.exit() was called above, so the output gets closed on error paths, too
        writer.close()

    sys.exit(0)
204
# run the standardization workflow only if this file is executed as a script (not when it is imported)
if __name__ == '__main__':
    main()