1.1.3.1. ChEMBL Structure Curation Pipeline

  1import sys
  2import argparse
  3import distutils.util
  4
  5import CDPL.Chem as Chem
  6
  7    
  8# performs ChEMBL molecule standardization and parent structure extraction (optional)
  9# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
 10def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
 11    # here, the standardization is carried out on a copy of the read input molecule
 12    # (if only one molecule instance gets provided as argument, modifications will be made in-place)
 13    change_flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)
 14
 15    # perform parent structure extraction (optional)
 16    if args.extract_parent:
 17        change_flags &= ~Chem.ChEMBLStandardizer.EXCLUDED  # clear excluded flag possibly set by the standardization
 18                                                           # procedure (might change after salt stripping)
 19        change_flags |= chembl_proc.getParent(out_mol)     # extract parent structure (in-place) and add information
 20                                                           # about the carried out modifcations
 21    return change_flags
 22
 23def parseArgs() -> argparse.Namespace:
 24    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')
 25
 26    parser.add_argument('-i',
 27                        dest='in_file',
 28                        required=True,
 29                        metavar='<file>',
 30                        help='Input molecule file')
 31    parser.add_argument('-o',
 32                        dest='out_file',
 33                        required=True,
 34                        metavar='<file>',
 35                        help='Output molecule file')
 36    parser.add_argument('-v',
 37                        dest='verb_level',
 38                        required=False,
 39                        metavar='<0|1|2>',
 40                        choices=range(0, 3),
 41                        default=1,
 42                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
 43                        type=int)
 44    parser.add_argument('-p',
 45                        dest='extract_parent',
 46                        required=False,
 47                        metavar='<true|false>',
 48                        type=lambda x:bool(distutils.util.strtobool(x)),
 49                        default=True,
 50                        help='Extract parent structure (default: true)')
 51    parser.add_argument('-d',
 52                        dest='drop_excluded',
 53                        required=False,
 54                        metavar='<true|false>',
 55                        type=lambda x:bool(distutils.util.strtobool(x)),
 56                        default=False,
 57                        help='Drop structures that fulfill the exclusion criterions (default: false)')
 58    parser.add_argument('-x',
 59                        dest='proc_excluded',
 60                        required=False,
 61                        metavar='<true|false>',
 62                        type=lambda x:bool(distutils.util.strtobool(x)),
 63                        default=True,
 64                        help='Standardize structures that fulfill the exclusion criterions (default: true)')
 65
 66    return parser.parse_args()
 67
 68def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
 69    changes = '   Carried out modifications:'
 70
 71    if (change_flags & Chem.ChEMBLStandardizer.EXPLICIT_HYDROGENS_REMOVED):
 72        changes += '\n    * Explicit hydrogens removed'
 73
 74    if (change_flags & Chem.ChEMBLStandardizer.UNKNOWN_STEREO_STANDARDIZED):
 75        changes += '\n    * Undefined stereocenter information standardized'
 76        
 77    if (change_flags & Chem.ChEMBLStandardizer.BONDS_KEKULIZED):
 78        changes += '\n    * Kekule structure generated'
 79
 80    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_NORMALIZED):
 81        changes += '\n    * Functional groups normalized'
 82        
 83    if (change_flags & Chem.ChEMBLStandardizer.CHARGES_REMOVED):
 84        changes += '\n    * Number of charged atoms reduced'
 85        
 86    if (change_flags & Chem.ChEMBLStandardizer.TARTRATE_STEREO_CLEARED):
 87        changes += '\n    * Configuration of chiral tartrate atoms set to undefined'
 88                
 89    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_2D_CORRECTED):
 90        changes += '\n    * 2D structure corrected'
 91
 92    if (change_flags & Chem.ChEMBLStandardizer.ISOTOPE_INFO_CLEARED):
 93        changes += '\n    * Isotope information cleared'
 94
 95    if (change_flags & Chem.ChEMBLStandardizer.SALT_COMPONENTS_REMOVED):
 96        changes += '\n    * Salt components removed'
 97
 98    if (change_flags & Chem.ChEMBLStandardizer.SOLVENT_COMPONENTS_REMOVED):
 99        changes += '\n    * Solvent components removed'
100
101    if (change_flags & Chem.ChEMBLStandardizer.DUPLICATE_COMPONENTS_REMOVED):
102        changes += '\n    * Duplicate components removed'
103        
104    return changes
105    
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """
    Compose the console message describing what happened to the molecule mol_id.

    Parameters:
        change_flags: flags returned by standardize()
        args: parsed command line options; 'verb_level', 'drop_excluded' and
              'proc_excluded' are read
        mol_id: identifier of the molecule used in the message

    Returns:
        The message text, or None if the verbosity level is 0 (no console output).
        At verbosity level 2 the list of carried out modifications is appended
        to messages reporting a modified molecule.
    """
    if args.verb_level == 0:
        return None

    is_excluded = bool(change_flags & Chem.ChEMBLStandardizer.EXCLUDED)

    # excluded molecules that get dropped are reported as such, whatever else happened
    if is_excluded and args.drop_excluded:
        return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

    if is_excluded:
        # counts as modified only if excluded molecules are processed at all
        # and any flag other than the excluded flag is set
        was_modified = args.proc_excluded and bool(change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED)
        unchanged_msg = '- Molecule %s: forwarded unchanged (flagged as excluded)'
        modified_msg = '- Molecule %s: modified (flagged as excluded)'
    else:
        was_modified = bool(change_flags)
        unchanged_msg = '- Molecule %s: forwarded unchanged'
        modified_msg = '- Molecule %s: modified'

    if not was_modified:
        return (unchanged_msg % mol_id)

    if args.verb_level == 2:
        return ((modified_msg + '\n%s') % (mol_id, getListOfChangesString(change_flags)))

    return (modified_msg % mol_id)
132
def main() -> None:
    """
    Script entry point: read molecules from the input file, standardize them and
    write the results to the output file.

    Every molecule is passed through standardize(); a log message is printed
    according to the verbosity level. Molecules flagged as excluded are skipped
    if the drop_excluded option is set, all others are written to the output file.

    The program is terminated via sys.exit(): with an error message if reading,
    processing or writing of a molecule fails, with exit status 0 otherwise.
    The output writer gets closed in either case.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1  # 1-based index of the molecule currently being read/processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i)  # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # molecules with a set excluded flag are not written if they shall be dropped;
                # a guard is used instead of 'continue' so that the molecule index below
                # still gets incremented for dropped molecules
                drop_mol = (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded

                if not drop_mol:
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule; a failure reported via the return value comes
                        # without an exception, so there are no error details to append
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e: # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    finally:
        # also executed when sys.exit() was called above -> the writer is closed on error paths, too
        writer.close()

    sys.exit(0)
204        
# run the pipeline only when this file is executed as a script (not when it is imported)
if __name__ == '__main__':
    main()

Download source file