1import sys
2import argparse
3
4import CDPL.Chem as Chem
5
6
def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
    """Standardize a molecule the ChEMBL way and optionally extract its parent structure.

    chembl_proc -- the Chem.ChEMBLStandardizer instance doing the work
    in_mol      -- the molecule that was read from the input
    out_mol     -- receives the processing result (the standardizer is called
                   with separate input and output molecules, so the input
                   molecule is not the one being modified)
    args        -- parsed command line options; 'proc_excluded' and
                   'extract_parent' are read here

    Returns the flags describing the modifications that were carried out.
    """
    # standardization step: 'args.proc_excluded' is handed through to the standardizer
    flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)

    # without parent structure extraction the standardization result is final
    if not args.extract_parent:
        return flags

    # drop the excluded flag possibly set by the standardization step, the
    # parent structure extraction reports it anew (it might change after salt stripping)
    flags = flags & ~Chem.ChEMBLStandardizer.EXCLUDED

    # parent structure extraction works directly on the output molecule and
    # reports the modifications it carried out
    parent_flags = chembl_proc.getParent(out_mol)

    return flags | parent_flags
21
def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
    """Build a multi-line, human readable listing of the modifications encoded in 'change_flags'.

    The result always starts with a header line; one bullet line is appended
    for every known modification flag that is set (in a fixed order).
    """
    std = Chem.ChEMBLStandardizer

    # modification flag -> description, listed in output order
    flag_descriptions = (
        (std.EXPLICIT_HYDROGENS_REMOVED, 'Explicit hydrogens removed'),
        (std.UNKNOWN_STEREO_STANDARDIZED, 'Undefined stereocenter information standardized'),
        (std.BONDS_KEKULIZED, 'Kekule structure generated'),
        (std.STRUCTURE_NORMALIZED, 'Functional groups normalized'),
        (std.CHARGES_REMOVED, 'Number of charged atoms reduced'),
        (std.TARTRATE_STEREO_CLEARED, 'Configuration of chiral tartrate atoms set to undefined'),
        (std.STRUCTURE_2D_CORRECTED, '2D structure corrected'),
        (std.ISOTOPE_INFO_CLEARED, 'Isotope information cleared'),
        (std.SALT_COMPONENTS_REMOVED, 'Salt components removed'),
        (std.SOLVENT_COMPONENTS_REMOVED, 'Solvent components removed'),
        (std.DUPLICATE_COMPONENTS_REMOVED, 'Duplicate components removed'),
    )

    bullets = ['\n * ' + text for flag, text in flag_descriptions if change_flags & flag]

    return ' Carried out modifications:' + ''.join(bullets)
59
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """Compose the console message describing what happened to a molecule.

    change_flags -- flags reported for the processed molecule
    args         -- parsed command line options; 'verb_level', 'drop_excluded'
                    and 'proc_excluded' are read here
    mol_id       -- identifier of the molecule used in the message

    Returns the message text. NOTE: at verbosity level 0 the function returns
    None (no message), even though the return annotation says 'str'.
    """
    if args.verb_level == 0:
        return None

    excluded = change_flags & Chem.ChEMBLStandardizer.EXCLUDED
    list_changes = (args.verb_level == 2)  # extra verbose: append the list of modifications

    # regular molecules: any set flag means that the molecule was modified
    if not excluded:
        if not change_flags:
            return ('- Molecule %s: forwarded unchanged' % mol_id)

        if list_changes:
            return ('- Molecule %s: modified\n%s' % (mol_id, getListOfChangesString(change_flags)))

        return ('- Molecule %s: modified' % mol_id)

    # excluded molecules: dropping takes precedence over everything else
    if args.drop_excluded:
        return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

    # reported as modified only if processing of excluded molecules is enabled
    # and some flag other than the excluded flag is set
    was_modified = args.proc_excluded and (change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED)

    if not was_modified:
        return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)

    if list_changes:
        return ('- Molecule %s: modified (flagged as excluded)\n%s' % (mol_id, getListOfChangesString(change_flags)))

    return ('- Molecule %s: modified (flagged as excluded)' % mol_id)
86
def parseArgs() -> argparse.Namespace:
    """Parse the command line (sys.argv) and return the resulting namespace.

    Recognized options:
      -i <file>        input molecule file (required)       -> in_file
      -o <file>        output molecule file (required)      -> out_file
      -v <0|1|2>       verbosity level, default 1           -> verb_level
      -p <true|false>  extract parent structure, default on -> extract_parent
      -d               drop excluded structures             -> drop_excluded
      -x <true|false>  standardize excluded structures,
                       default on                           -> proc_excluded

    Invalid command lines make argparse print a usage message and terminate
    the program (SystemExit). This includes boolean option values that are
    neither a recognized 'true' nor a recognized 'false' spelling.
    """
    true_values = ('y', 'yes', 'on', '1', 'true', 't')
    false_values = ('n', 'no', 'off', '0', 'false', 'f')

    def strtobool(value: str) -> bool:
        """Convert a textual boolean (case-insensitive) to bool, rejecting unknown spellings."""
        normalized = value.lower()

        if normalized in true_values:
            return True

        if normalized in false_values:
            return False

        # silently treating typos such as 'ture' as False would flip the
        # meaning of the option without any notice -> report them instead
        raise argparse.ArgumentTypeError('invalid boolean value \'%s\' (expected one of: %s)' %
                                         (value, ', '.join(true_values + false_values)))

    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')

    parser.add_argument('-i',
                        dest='in_file',
                        required=True,
                        metavar='<file>',
                        help='Input molecule file')
    parser.add_argument('-o',
                        dest='out_file',
                        required=True,
                        metavar='<file>',
                        help='Output molecule file')
    parser.add_argument('-v',
                        dest='verb_level',
                        required=False,
                        metavar='<0|1|2>',
                        choices=range(0, 3),
                        default=1,
                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
                        type=int)
    parser.add_argument('-p',
                        dest='extract_parent',
                        required=False,
                        metavar='<true|false>',
                        type=strtobool,
                        default=True,
                        help='Extract parent structure (default: true)')
    parser.add_argument('-d',
                        dest='drop_excluded',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Drop structures that fulfill the exclusion criterions (default: false)')
    parser.add_argument('-x',
                        dest='proc_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=strtobool,
                        default=True,
                        help='Standardize structures that fulfill the exclusion criterions (default: true)')

    return parser.parse_args()
136
def main() -> None:
    """Read molecules from the input file, standardize them and write the results.

    For every molecule read, the ChEMBL standardization (plus optional parent
    structure extraction) is performed, a log message is printed (verbosity
    dependent) and the result is written to the output file unless the molecule
    was flagged as excluded and dropping of excluded molecules was requested.

    The function always terminates the program via sys.exit(): with status 0
    after all molecules were processed, or with an error message as soon as
    reading, processing or writing of a molecule fails. The output writer is
    closed in either case.
    """
    args = parseArgs()  # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1  # 1-based index of the input molecule currently being handled

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i)  # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # excluded molecules are not written if dropping them was requested;
                # the write step is skipped via a condition (not 'continue') so that
                # the molecule counter below is still advanced for dropped molecules
                dropped = (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded

                if not dropped:
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e:  # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e:  # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            # advance the counter for every molecule read, written or dropped,
            # so that identifiers keep matching the position in the input file
            i += 1

    except Exception as e:  # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    finally:
        # SystemExit raised by the sys.exit() calls above passes through here as
        # well, so the writer also gets closed when the program aborts with an error
        writer.close()

    sys.exit(0)
208
# script entry point: run the tool only when this file is executed directly,
# not when it gets imported as a module
if __name__ == '__main__':
    main()