1.1.3.1. ChEMBL structure curation pipeline

  1import sys
  2import os
  3import argparse
  4import distutils.util
  5
  6import CDPL.Chem as Chem
  7
  8    
  9# performs ChEMBL molecule standardization and parent structure extraction (optional)
 10# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
 11def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
 12    # here, the standardization is carried out on a copy of the read input molecule
 13    # (if only one molecule instance gets provided as argument, modifications will be made in-place)
 14    change_flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)
 15
 16    # perform parent structure extraction (optional)
 17    if args.extract_parent:
 18        change_flags &= ~Chem.ChEMBLStandardizer.EXCLUDED  # clear excluded flag possibly set by the standardization
 19                                                           # procedure (might change after salt stripping)
 20        change_flags |= chembl_proc.getParent(out_mol)     # extract parent structure (in-place) and add information
 21                                                           # about the carried out modifcations
 22    return change_flags
 23
 24def parseArgs() -> argparse.Namespace:
 25    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')
 26
 27    parser.add_argument('-i',
 28                        dest='in_file',
 29                        required=True,
 30                        metavar='<file>',
 31                        help='Input molecule file')
 32    parser.add_argument('-o',
 33                        dest='out_file',
 34                        required=True,
 35                        metavar='<file>',
 36                        help='Output molecule file')
 37    parser.add_argument('-v',
 38                        dest='verb_level',
 39                        required=False,
 40                        metavar='<0|1|2>',
 41                        choices=range(0, 3),
 42                        default=1,
 43                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
 44                        type=int)
 45    parser.add_argument('-p',
 46                        dest='extract_parent',
 47                        required=False,
 48                        metavar='<true|false>',
 49                        type=lambda x:bool(distutils.util.strtobool(x)),
 50                        default=True,
 51                        help='Extract parent structure (default: true)')
 52    parser.add_argument('-d',
 53                        dest='drop_excluded',
 54                        required=False,
 55                        metavar='<true|false>',
 56                        type=lambda x:bool(distutils.util.strtobool(x)),
 57                        default=False,
 58                        help='Drop structures that fulfill the exclusion criterions (default: false)')
 59    parser.add_argument('-x',
 60                        dest='proc_excluded',
 61                        required=False,
 62                        metavar='<true|false>',
 63                        type=lambda x:bool(distutils.util.strtobool(x)),
 64                        default=True,
 65                        help='Standardize structures that fulfill the exclusion criterions (default: true)')
 66
 67    return parser.parse_args()
 68
 69def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
 70    changes = '   Carried out modifications:'
 71
 72    if (change_flags & Chem.ChEMBLStandardizer.EXPLICIT_HYDROGENS_REMOVED):
 73        changes += '\n    * Explicit hydrogens removed'
 74
 75    if (change_flags & Chem.ChEMBLStandardizer.UNKNOWN_STEREO_STANDARDIZED):
 76        changes += '\n    * Undefined stereocenter information standardized'
 77        
 78    if (change_flags & Chem.ChEMBLStandardizer.BONDS_KEKULIZED):
 79        changes += '\n    * Kekule structure generated'
 80
 81    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_NORMALIZED):
 82        changes += '\n    * Functional groups normalized'
 83        
 84    if (change_flags & Chem.ChEMBLStandardizer.CHARGES_REMOVED):
 85        changes += '\n    * Number of charged atoms reduced'
 86        
 87    if (change_flags & Chem.ChEMBLStandardizer.TARTRATE_STEREO_CLEARED):
 88        changes += '\n    * Configuration of chiral tartrate atoms set to undefined'
 89                
 90    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_2D_CORRECTED):
 91        changes += '\n    * 2D structure corrected'
 92
 93    if (change_flags & Chem.ChEMBLStandardizer.ISOTOPE_INFO_CLEARED):
 94        changes += '\n    * Isotope information cleared'
 95
 96    if (change_flags & Chem.ChEMBLStandardizer.SALT_COMPONENTS_REMOVED):
 97        changes += '\n    * Salt components removed'
 98
 99    if (change_flags & Chem.ChEMBLStandardizer.SOLVENT_COMPONENTS_REMOVED):
100        changes += '\n    * Solvent components removed'
101
102    if (change_flags & Chem.ChEMBLStandardizer.DUPLICATE_COMPONENTS_REMOVED):
103        changes += '\n    * Duplicate components removed'
104        
105    return changes
106    
def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
    """Build the console message describing what happened to a molecule.

    Args:
        change_flags: Change flags obtained for the molecule.
        args: Parsed command line options; `verb_level`, `drop_excluded`
            and `proc_excluded` are read here.
        mol_id: Identifier of the molecule to put into the message.

    Returns:
        The message text, or None if the verbosity level is 0 (note that
        the return annotation only mentions str).
    """
    if args.verb_level == 0:
        return None

    list_changes = (args.verb_level == 2)  # extra verbose -> append the list of modifications

    # molecules that have not been flagged as excluded
    if not (change_flags & Chem.ChEMBLStandardizer.EXCLUDED):
        if not change_flags:
            return ('- Molecule %s: forwarded unchanged' % mol_id)

        if list_changes:
            return ('- Molecule %s: modified\n%s' % (mol_id, getListOfChangesString(change_flags)))

        return ('- Molecule %s: modified' % mol_id)

    # from here on: molecule has been flagged as excluded
    if args.drop_excluded:
        return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)

    # modifications are reported only if excluded molecules get processed
    # and any flag other than the excluded flag is set
    if args.proc_excluded and (change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED):
        if list_changes:
            return ('- Molecule %s: modified (flagged as excluded)\n%s' % (mol_id, getListOfChangesString(change_flags)))

        return ('- Molecule %s: modified (flagged as excluded)' % mol_id)

    return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)
133
def main() -> None:
    """Read molecules from the input file, standardize them and write the results.

    The process is terminated via sys.exit() with an error message as soon as
    reading, processing or writing of a molecule fails, and with status 0
    after all input molecules have been handled.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1  # 1-based index of the input molecule currently being handled

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i)  # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # check if the excluded flag has been set and take appropriate action
                if (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded:
                    # advance the input counter for dropped molecules too, otherwise the
                    # identifiers of all following molecules would carry a stale index
                    i += 1
                    continue

                try:
                    # calculate (if not present) some basic properties of the output molecule
                    # that might be required for writing (output format dependent)
                    Chem.calcImplicitHydrogenCounts(out_mol, False)
                    Chem.perceiveHybridizationStates(out_mol, False)
                    Chem.perceiveSSSR(out_mol, False)
                    Chem.setRingFlags(out_mol, False)
                    Chem.setAromaticityFlags(out_mol, False)
                    Chem.perceiveComponents(out_mol, False)

                    # write output molecule
                    if not writer.write(out_mol):
                        # no exception object exists on this path, so the message
                        # must not try to include exception details
                        sys.exit('Error: writing molecule %s failed' % mol_id)

                except Exception as e: # handle exception raised in case of severe write errors
                    sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    writer.close()
    sys.exit(0)
205        
# run the tool only if this file is executed as a script (not when imported as a module)
if __name__ == '__main__':
    main()

Download source file