1.1.3.1. ChEMBL Structure Curation Pipeline
The script chembl_preproc.py performs molecule standardization according to the ChEMBL structure curation pipeline workflow [15].
Synopsis
python chembl_preproc.py [-h] -i <file> -o <file> [-v <0|1|2>] [-p <true|false>] [-d] [-x <true|false>]
Mandatory options
- -i <file>
Input molecule file
- -o <file>
Output molecule file
Other options
- -h, --help
Show help message and exit
- -i <file>
Input molecule file
- -o <file>
Output molecule file
- -v <0|1|2>
Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)
- -p <true|false>
Extract parent structure (default: true)
- -d
Drop structures that fulfill the exclusion criterions (default: false)
- -x <true|false>
Standardize structures that fulfill the exclusion criterions (default: true)
Code
1import sys
2import argparse
3
4import CDPL.Chem as Chem
5
6
def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
    """Run ChEMBL standardization and, optionally, parent structure extraction.

    The given Chem.ChEMBLStandardizer instance standardizes in_mol and stores
    the result in out_mol; args.proc_excluded is forwarded to the standardizer
    as its third argument. If args.extract_parent is set, the parent structure
    of out_mol is extracted afterwards (in-place).

    Returns the change flags describing the carried out modifications.
    """
    # standardization works on a copy of the read input molecule (if only one
    # molecule instance were passed, modifications would be made in-place)
    flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)

    if not args.extract_parent:
        return flags

    # clear the excluded flag possibly set by the standardization procedure
    # (the exclusion status might change after salt stripping)
    flags &= ~Chem.ChEMBLStandardizer.EXCLUDED

    # extract the parent structure (in-place) and add the information about
    # the carried out modifications
    flags |= chembl_proc.getParent(out_mol)

    return flags
21
def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
    """Build the multi-line listing of modifications encoded in change_flags.

    The returned string starts with a header line, followed by one bullet line
    for every Chem.ChEMBLStandardizer change flag that is set in change_flags.
    Bullets appear in the fixed order of the table below.
    """
    std = Chem.ChEMBLStandardizer

    # (flag, bullet line) pairs in reporting order
    reported_changes = (
        (std.EXPLICIT_HYDROGENS_REMOVED, '\n * Explicit hydrogens removed'),
        (std.UNKNOWN_STEREO_STANDARDIZED, '\n * Undefined stereocenter information standardized'),
        (std.BONDS_KEKULIZED, '\n * Kekule structure generated'),
        (std.STRUCTURE_NORMALIZED, '\n * Functional groups normalized'),
        (std.CHARGES_REMOVED, '\n * Number of charged atoms reduced'),
        (std.TARTRATE_STEREO_CLEARED, '\n * Configuration of chiral tartrate atoms set to undefined'),
        (std.STRUCTURE_2D_CORRECTED, '\n * 2D structure corrected'),
        (std.ISOTOPE_INFO_CLEARED, '\n * Isotope information cleared'),
        (std.SALT_COMPONENTS_REMOVED, '\n * Salt components removed'),
        (std.SOLVENT_COMPONENTS_REMOVED, '\n * Solvent components removed'),
        (std.DUPLICATE_COMPONENTS_REMOVED, '\n * Duplicate components removed'),
    )

    parts = [' Carried out modifications:']
    parts.extend(line for flag, line in reported_changes if change_flags & flag)

    return ''.join(parts)
59
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """Compose the console message describing what happened to a molecule.

    Returns None when args.verb_level is 0 (no console output). Otherwise the
    message states whether the molecule was discarded, modified or forwarded
    unchanged, and whether it was flagged as excluded. At verbosity level 2
    the message for a modified molecule is followed by the list of changes.
    """
    if args.verb_level == 0:
        return None

    if change_flags & Chem.ChEMBLStandardizer.EXCLUDED:
        if args.drop_excluded:
            return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

        # an excluded molecule counts as modified only if excluded structures
        # get processed at all and flags other than EXCLUDED are set
        was_modified = args.proc_excluded and (change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED)
        suffix = ' (flagged as excluded)'
    else:
        was_modified = change_flags
        suffix = ''

    if not was_modified:
        return ('- Molecule %s: forwarded unchanged%s' % (mol_id, suffix))

    message = '- Molecule %s: modified%s' % (mol_id, suffix)

    if args.verb_level == 2:
        message += '\n%s' % getListOfChangesString(change_flags)

    return message
86
def parseArgs() -> argparse.Namespace:
    """Parse the command line arguments of the script.

    Returns the argparse.Namespace with the attributes in_file, out_file,
    verb_level, extract_parent, drop_excluded and proc_excluded.

    Invalid arguments make argparse print a usage message and exit. This
    includes boolean option values that are neither a recognized true nor a
    recognized false spelling.
    """
    true_values = ('y', 'yes', 'on', '1', 'true', 't')
    false_values = ('n', 'no', 'off', '0', 'false', 'f')

    def strtobool(value: str) -> bool:
        # Reject unknown spellings instead of silently treating them as false,
        # otherwise a typo like '-p ture' would quietly disable the option.
        normalized = value.lower()

        if normalized in true_values:
            return True

        if normalized in false_values:
            return False

        raise argparse.ArgumentTypeError('invalid boolean value \'%s\' (expected true or false)' % value)

    parser = argparse.ArgumentParser(description='Performs molecule standardization according to the ChEMBL structure curation pipeline workflow.')

    parser.add_argument('-i',
                        dest='in_file',
                        required=True,
                        metavar='<file>',
                        help='Input molecule file')
    parser.add_argument('-o',
                        dest='out_file',
                        required=True,
                        metavar='<file>',
                        help='Output molecule file')
    parser.add_argument('-v',
                        dest='verb_level',
                        required=False,
                        metavar='<0|1|2>',
                        choices=range(0, 3),
                        default=1,
                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
                        type=int)
    parser.add_argument('-p',
                        dest='extract_parent',
                        required=False,
                        metavar='<true|false>',
                        type=strtobool,
                        default=True,
                        help='Extract parent structure (default: true)')
    parser.add_argument('-d',
                        dest='drop_excluded',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Drop structures that fulfill the exclusion criterions (default: false)')
    parser.add_argument('-x',
                        dest='proc_excluded',
                        required=False,
                        metavar='<true|false>',
                        type=strtobool,
                        default=True,
                        help='Standardize structures that fulfill the exclusion criterions (default: true)')

    return parser.parse_args()
138
def main() -> None:
    """Standardize all molecules of the input file and write them to the output file.

    Molecules are read one after the other, standardized (optionally including
    parent structure extraction) and written out, unless they were flagged as
    excluded and dropping of excluded structures was requested. Depending on
    the verbosity level a message is printed for every molecule.

    The script terminates via sys.exit(): with an error message on read,
    processing or write failures, with status 0 otherwise.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # write canonical SMILES
    Chem.setSMILESOutputCanonicalFormParameter(writer, True)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1 # 1-based index of the molecule currently being read/processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i) # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # Excluded molecules are skipped on request. This is done by
                # guarding the write step rather than with 'continue' so that
                # the molecule counter below is advanced for dropped molecules
                # too; otherwise all subsequent molecule ids would be off.
                if not ((change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded):
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e: # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    writer.close()
    sys.exit(0)
213
# run the standardization workflow only when executed as a script (not on import)
if __name__ == "__main__":
    main()