1.1.3.1. ChEMBL Structure Curation Pipeline

The script chembl_preproc.py performs molecule standardization according to the ChEMBL structure curation pipeline workflow [15].

Synopsis

python chembl_preproc.py [-h] -i <file> -o <file> [-v <0|1|2>] [-p <true|false>] [-d] [-x <true|false>]

Mandatory options

-i <file>

Input molecule file

-o <file>

Output molecule file

Other options

-h, --help

Show help message and exit

-i <file>

Input molecule file

-o <file>

Output molecule file

-v <0|1|2>

Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)

-p <true|false>

Extract parent structure (default: true)

-d

Drop structures that fulfill the exclusion criterions (default: false)

-x <true|false>

Standardize structures that fulfill the exclusion criterions (default: true)

Code

  1import sys
  2import argparse
  3
  4import CDPL.Chem as Chem
  5
  6    
  7# performs ChEMBL molecule standardization and parent structure extraction (optional)
  8# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
  9def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
 10    # here, the standardization is carried out on a copy of the read input molecule
 11    # (if only one molecule instance gets provided as argument, modifications will be made in-place)
 12    change_flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)
 13
 14    # perform parent structure extraction (optional)
 15    if args.extract_parent:
 16        change_flags &= ~Chem.ChEMBLStandardizer.EXCLUDED  # clear excluded flag possibly set by the standardization
 17                                                           # procedure (might change after salt stripping)
 18        change_flags |= chembl_proc.getParent(out_mol)     # extract parent structure (in-place) and add information
 19                                                           # about the carried out modifcations
 20    return change_flags
 21
 22def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
 23    changes = '   Carried out modifications:'
 24
 25    if (change_flags & Chem.ChEMBLStandardizer.EXPLICIT_HYDROGENS_REMOVED):
 26        changes += '\n    * Explicit hydrogens removed'
 27
 28    if (change_flags & Chem.ChEMBLStandardizer.UNKNOWN_STEREO_STANDARDIZED):
 29        changes += '\n    * Undefined stereocenter information standardized'
 30        
 31    if (change_flags & Chem.ChEMBLStandardizer.BONDS_KEKULIZED):
 32        changes += '\n    * Kekule structure generated'
 33
 34    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_NORMALIZED):
 35        changes += '\n    * Functional groups normalized'
 36        
 37    if (change_flags & Chem.ChEMBLStandardizer.CHARGES_REMOVED):
 38        changes += '\n    * Number of charged atoms reduced'
 39        
 40    if (change_flags & Chem.ChEMBLStandardizer.TARTRATE_STEREO_CLEARED):
 41        changes += '\n    * Configuration of chiral tartrate atoms set to undefined'
 42                
 43    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_2D_CORRECTED):
 44        changes += '\n    * 2D structure corrected'
 45
 46    if (change_flags & Chem.ChEMBLStandardizer.ISOTOPE_INFO_CLEARED):
 47        changes += '\n    * Isotope information cleared'
 48
 49    if (change_flags & Chem.ChEMBLStandardizer.SALT_COMPONENTS_REMOVED):
 50        changes += '\n    * Salt components removed'
 51
 52    if (change_flags & Chem.ChEMBLStandardizer.SOLVENT_COMPONENTS_REMOVED):
 53        changes += '\n    * Solvent components removed'
 54
 55    if (change_flags & Chem.ChEMBLStandardizer.DUPLICATE_COMPONENTS_REMOVED):
 56        changes += '\n    * Duplicate components removed'
 57        
 58    return changes
 59    
 60def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
 61    if args.verb_level == 0:
 62        return None
 63    
 64    if (change_flags & Chem.ChEMBLStandardizer.EXCLUDED):
 65        if args.drop_excluded:
 66            return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)
 67
 68        if not args.proc_excluded:
 69            return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)
 70
 71        if (change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED):
 72            if args.verb_level == 2:
 73                return ('- Molecule %s: modified (flagged as excluded)\n%s' % (mol_id, getListOfChangesString(change_flags)))
 74            
 75            return ('- Molecule %s: modified (flagged as excluded)' % mol_id)
 76
 77        return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)
 78
 79    if change_flags:
 80        if args.verb_level == 2:
 81            return ('- Molecule %s: modified\n%s' % (mol_id, getListOfChangesString(change_flags)))
 82                        
 83        return ('- Molecule %s: modified' % mol_id)
 84            
 85    return ('- Molecule %s: forwarded unchanged' % mol_id)
 86
 87def parseArgs() -> argparse.Namespace:
 88    def strtobool(value: str) -> bool:
 89        value = value.lower()
 90        
 91        if value in ("y", "yes", "on", "1", "true", "t"):
 92            return True
 93        
 94        return False
 95    
 96    parser = argparse.ArgumentParser(description='Performs molecule standardization according to the ChEMBL structure curation pipeline workflow.')
 97
 98    parser.add_argument('-i',
 99                        dest='in_file',
100                        required=True,
101                        metavar='<file>',
102                        help='Input molecule file')
103    parser.add_argument('-o',
104                        dest='out_file',
105                        required=True,
106                        metavar='<file>',
107                        help='Output molecule file')
108    parser.add_argument('-v',
109                        dest='verb_level',
110                        required=False,
111                        metavar='<0|1|2>',
112                        choices=range(0, 3),
113                        default=1,
114                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
115                        type=int)
116    parser.add_argument('-p',
117                        dest='extract_parent',
118                        required=False,
119                        metavar='<true|false>',
120                        type=lambda x:bool(strtobool(x)),
121                        default=True,
122                        help='Extract parent structure (default: true)')
123    parser.add_argument('-d',
124                        dest='drop_excluded',
125                        required=False,
126                        action='store_true',
127                        default=False,
128                        help='Drop structures that fulfill the exclusion criterions (default: false)')
129    parser.add_argument('-x',
130                        dest='proc_excluded',
131                        required=False,
132                        metavar='<true|false>',
133                        type=lambda x:bool(strtobool(x)),
134                        default=True,
135                        help='Standardize structures that fulfill the exclusion criterions (default: true)')
136
137    return parser.parse_args()
138
def main() -> None:
    """
    Reads molecules from the input file, standardizes them and writes the results.

    Every molecule read from args.in_file is passed through standardize(); the
    outcome is reported on the console (depending on the verbosity level) and the
    result is written to args.out_file unless it is flagged as excluded and
    dropping of excluded structures was requested.

    The function always terminates via sys.exit(): with status 0 after the end of
    input has been reached, or with an error message if reading, processing or
    writing of a molecule fails.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # write canonical SMILES
    Chem.setSMILESOutputCanonicalFormParameter(writer, True)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1  # 1-based index of the molecule currently being read/processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i)  # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # check if the excluded flag has been set and the molecule has to be dropped;
                # a dropped molecule only skips the output step (no 'continue' here!) so that
                # the molecule index below is still advanced and stays in sync with the input
                dropped = (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded

                if not dropped:
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e: # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    writer.close()
    sys.exit(0)
213        
# run main() only when the file is executed as a script (not when it is imported)
if __name__ == '__main__':
    main()

Download source file