1.1.3.1. ChEMBL Structure Curation Pipeline

  1import sys
  2import argparse
  3
  4import CDPL.Chem as Chem
  5
  6    
  7# performs ChEMBL molecule standardization and parent structure extraction (optional)
  8# for a given input molecule using a provided Chem.ChEMBLStandardizer instance
  9def standardize(chembl_proc: Chem.ChEMBLStandardizer, in_mol: Chem.Molecule, out_mol: Chem.Molecule, args: argparse.Namespace) -> Chem.ChEMBLStandardizer.ChangeFlags:
 10    # here, the standardization is carried out on a copy of the read input molecule
 11    # (if only one molecule instance gets provided as argument, modifications will be made in-place)
 12    change_flags = chembl_proc.standardize(in_mol, out_mol, args.proc_excluded)
 13
 14    # perform parent structure extraction (optional)
 15    if args.extract_parent:
 16        change_flags &= ~Chem.ChEMBLStandardizer.EXCLUDED  # clear excluded flag possibly set by the standardization
 17                                                           # procedure (might change after salt stripping)
 18        change_flags |= chembl_proc.getParent(out_mol)     # extract parent structure (in-place) and add information
 19                                                           # about the carried out modifcations
 20    return change_flags
 21
 22def getListOfChangesString(change_flags: Chem.ChEMBLStandardizer.ChangeFlags) -> str:
 23    changes = '   Carried out modifications:'
 24
 25    if (change_flags & Chem.ChEMBLStandardizer.EXPLICIT_HYDROGENS_REMOVED):
 26        changes += '\n    * Explicit hydrogens removed'
 27
 28    if (change_flags & Chem.ChEMBLStandardizer.UNKNOWN_STEREO_STANDARDIZED):
 29        changes += '\n    * Undefined stereocenter information standardized'
 30        
 31    if (change_flags & Chem.ChEMBLStandardizer.BONDS_KEKULIZED):
 32        changes += '\n    * Kekule structure generated'
 33
 34    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_NORMALIZED):
 35        changes += '\n    * Functional groups normalized'
 36        
 37    if (change_flags & Chem.ChEMBLStandardizer.CHARGES_REMOVED):
 38        changes += '\n    * Number of charged atoms reduced'
 39        
 40    if (change_flags & Chem.ChEMBLStandardizer.TARTRATE_STEREO_CLEARED):
 41        changes += '\n    * Configuration of chiral tartrate atoms set to undefined'
 42                
 43    if (change_flags & Chem.ChEMBLStandardizer.STRUCTURE_2D_CORRECTED):
 44        changes += '\n    * 2D structure corrected'
 45
 46    if (change_flags & Chem.ChEMBLStandardizer.ISOTOPE_INFO_CLEARED):
 47        changes += '\n    * Isotope information cleared'
 48
 49    if (change_flags & Chem.ChEMBLStandardizer.SALT_COMPONENTS_REMOVED):
 50        changes += '\n    * Salt components removed'
 51
 52    if (change_flags & Chem.ChEMBLStandardizer.SOLVENT_COMPONENTS_REMOVED):
 53        changes += '\n    * Solvent components removed'
 54
 55    if (change_flags & Chem.ChEMBLStandardizer.DUPLICATE_COMPONENTS_REMOVED):
 56        changes += '\n    * Duplicate components removed'
 57        
 58    return changes
 59    
 60def getLogMessage(change_flags: Chem.ChEMBLStandardizer.ChangeFlags, args: argparse.Namespace, mol_id: str) -> str:
 61    if args.verb_level == 0:
 62        return None
 63    
 64    if (change_flags & Chem.ChEMBLStandardizer.EXCLUDED):
 65        if args.drop_excluded:
 66            return ('- Molecule %s: discarded (flagged as excluded)' % mol_id)
 67
 68        if not args.proc_excluded:
 69            return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)
 70
 71        if (change_flags & ~Chem.ChEMBLStandardizer.EXCLUDED):
 72            if args.verb_level == 2:
 73                return ('- Molecule %s: modified (flagged as excluded)\n%s' % (mol_id, getListOfChangesString(change_flags)))
 74            
 75            return ('- Molecule %s: modified (flagged as excluded)' % mol_id)
 76
 77        return ('- Molecule %s: forwarded unchanged (flagged as excluded)' % mol_id)
 78
 79    if change_flags:
 80        if args.verb_level == 2:
 81            return ('- Molecule %s: modified\n%s' % (mol_id, getListOfChangesString(change_flags)))
 82                        
 83        return ('- Molecule %s: modified' % mol_id)
 84            
 85    return ('- Molecule %s: forwarded unchanged' % mol_id)
 86
 87def parseArgs() -> argparse.Namespace:
 88    def strtobool(value: str) -> bool:
 89        value = value.lower()
 90        if value in ("y", "yes", "on", "1", "true", "t"):
 91            return True
 92        return False
 93    
 94    parser = argparse.ArgumentParser(description='Performs molecule standardization as done by the ChEMBL structure curation pipeline')
 95
 96    parser.add_argument('-i',
 97                        dest='in_file',
 98                        required=True,
 99                        metavar='<file>',
100                        help='Input molecule file')
101    parser.add_argument('-o',
102                        dest='out_file',
103                        required=True,
104                        metavar='<file>',
105                        help='Output molecule file')
106    parser.add_argument('-v',
107                        dest='verb_level',
108                        required=False,
109                        metavar='<0|1|2>',
110                        choices=range(0, 3),
111                        default=1,
112                        help='Verbosity level (default: 1; 0 -> no console output, 1 -> verbose, 2 -> extra verbose)',
113                        type=int)
114    parser.add_argument('-p',
115                        dest='extract_parent',
116                        required=False,
117                        metavar='<true|false>',
118                        type=lambda x:bool(strtobool(x)),
119                        default=True,
120                        help='Extract parent structure (default: true)')
121    parser.add_argument('-d',
122                        dest='drop_excluded',
123                        required=False,
124                        action='store_true',
125                        default=False,
126                        help='Drop structures that fulfill the exclusion criterions (default: false)')
127    parser.add_argument('-x',
128                        dest='proc_excluded',
129                        required=False,
130                        metavar='<true|false>',
131                        type=lambda x:bool(strtobool(x)),
132                        default=True,
133                        help='Standardize structures that fulfill the exclusion criterions (default: true)')
134
135    return parser.parse_args()
136
def main() -> None:
    """Script entry point: read, standardize and write molecules one after the other.

    Molecules are read from the input file given on the command line, processed
    by standardize() and written to the output file unless they are flagged as
    excluded and dropping of such molecules was requested. A message for each
    molecule is printed as composed by getLogMessage().

    The program is terminated via sys.exit() with an error message on the first
    read, processing or write failure, and with exit code 0 on success.
    """
    args = parseArgs() # process command line arguments

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # create writer for output molecules (format specified by file extension)
    writer = Chem.MolecularGraphWriter(args.out_file)

    # do not update timestamp for output in an MDL format, just for testing purposes!
    Chem.setMDLUpdateTimestampParameter(writer, False)

    # create instances of the default implementation of the Chem.Molecule interface for the input and output molecules
    in_mol = Chem.BasicMolecule()
    out_mol = Chem.BasicMolecule()

    # create an instance of CDPKit's ChEMBL structure curation pipeline implementation
    chembl_proc = Chem.ChEMBLStandardizer()
    i = 1  # 1-based index of the molecule currently being read/processed

    try:
        # read and process molecules one after the other until the end of input has been reached (or a severe error occurs)
        while reader.read(in_mol):
            # compose a molecule identifier
            mol_id = Chem.getName(in_mol).strip()

            if mol_id == '':
                mol_id = '#' + str(i)  # fallback if name is empty or not available
            else:
                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))

            try:
                # perform standardization and parent structure extraction (optional)
                change_flags = standardize(chembl_proc, in_mol, out_mol, args)

                log_msg = getLogMessage(change_flags, args, mol_id)

                if log_msg:
                    print(log_msg)

                # check if the excluded flag has been set and dropping was requested;
                # deliberately not done via 'continue' since this would skip the
                # counter increment at the end of the loop body and thus falsify
                # the index reported for all subsequent molecules
                dropped = (change_flags & Chem.ChEMBLStandardizer.EXCLUDED) and args.drop_excluded

                if not dropped:
                    try:
                        # calculate (if not present) some basic properties of the output molecule
                        # that might be required for writing (output format dependent)
                        Chem.calcImplicitHydrogenCounts(out_mol, False)
                        Chem.perceiveHybridizationStates(out_mol, False)
                        Chem.perceiveSSSR(out_mol, False)
                        Chem.setRingFlags(out_mol, False)
                        Chem.setAromaticityFlags(out_mol, False)
                        Chem.perceiveComponents(out_mol, False)

                        # write output molecule
                        if not writer.write(out_mol):
                            sys.exit('Error: writing molecule %s failed' % mol_id)

                    except Exception as e: # handle exception raised in case of severe write errors
                        sys.exit('Error: writing molecule %s failed: %s' % (mol_id, str(e)))

            except Exception as e: # handle exception raised in case of severe structure processing errors
                sys.exit('Error: processing of molecule %s failed: %s' % (mol_id, str(e)))

            # advance the counter for every molecule read, whether it was written or dropped
            i += 1

    except Exception as e: # handle exception raised in case of severe read errors
        sys.exit('Error: reading of molecule %s failed: %s' % (str(i), str(e)))

    writer.close()
    sys.exit(0)
208        
# run main() only when this file is executed as a script (not when it gets imported)
if __name__ == '__main__':
    main()

Download source file