1.1.4.1. Testing for the Presence of Substructures

The script substruct_filter.py reads molecules from an input file and writes all molecules that match the specified SMARTS substructure pattern to the output file.

Synopsis

python substruct_filter.py [-h] -i <file> -o <file> -p <SMARTS> [-q] [-m]

Mandatory options

-i <file>

Molecule input file

-o <file>

Molecule output file

-p <SMARTS>

SMARTS pattern describing the substructure to search for

Other options

-h, --help

Show help message and exit

-m

Set atom mapping ids of output molecule atoms to the ids of the matching SMARTS pattern atoms (default: false)

-q

Disable progress output (default: false)

Code

  1import sys
  2import argparse
  3
  4import CDPL.Chem as Chem
  5
  6
  7def filterMolecules() -> None:
  8    args = parseArgs()
  9    
 10    # create reader for input molecules (format specified by file extension)
 11    reader = Chem.MoleculeReader(args.in_file) 
 12
 13    # create writer for the output of matching molecules (format specified by file extension)
 14    writer = Chem.MolecularGraphWriter(args.out_file) 
 15
 16    # parse the substructure SMARTS pattern
 17    try:
 18        sub_srch_ptn = Chem.parseSMARTS(args.smarts_ptn)
 19    except Exception as e:
 20        sys.exit('Error: parsing of SMARTS pattern failed:\n%s' % str(e))
 21
 22    # create and initialize an instance of the class Chem.SubstructureSearch that
 23    # implements the substructure searching algorithm
 24    substr_srch = Chem.SubstructureSearch(sub_srch_ptn)
 25
 26    # special settings for option -m
 27    if args.mark_matched_atoms:
 28        # store only the first found substructure mapping
 29        substr_srch.setMaxNumMappings(1)
 30         # in case of SMILES output: write atom-atom mapping ids, if available
 31        Chem.setSMILESMolOutputAtomMappingIDParameter(writer, True)
 32    
 33    # create an instance of the default implementation of the Chem.Molecule interface
 34    mol = Chem.BasicMolecule()
 35    i = 1
 36    
 37    # read and process molecules one after the other until the end of input has been reached
 38    try:
 39        while reader.read(mol):
 40            # compose a simple molecule identifier
 41            mol_id = Chem.getName(mol).strip() 
 42
 43            if mol_id == '':
 44                mol_id = '#' + str(i) # fallback if name is empty
 45            else:
 46                mol_id = '\'%s\' (#%s)' % (mol_id, str(i))
 47
 48            if not args.quiet:
 49                print('- Searching for a matching substructure in molecule %s...' % mol_id)
 50
 51            try:
 52                Chem.initSubstructureSearchTarget(mol, False)
 53
 54                if args.mark_matched_atoms:
 55                    found_match = substr_srch.findMappings(mol)  # for option -m atom-atom mapping data are required
 56                else:
 57                    found_match = substr_srch.mappingExists(mol) # if option -m is false, information that a mapping exists is sufficient
 58                    
 59                if found_match:
 60                    if not args.quiet:
 61                        print(' -> substructure found, forwarding molecule to output file')
 62
 63                    # processing for option -m
 64                    if args.mark_matched_atoms:
 65                        for ap in substr_srch.getMapping(0).atomMapping.items(): # for each [ptn. atom, mol. atom] pair of the found substructure match
 66                            if Chem.hasAtomMappingID(ap[0]):                                 # if the pattern atom has an atom-atom mapping id
 67                                Chem.setAtomMappingID(ap[1], Chem.getAtomMappingID(ap[0]))   # set the id of the matched molecule atom to the same value
 68                        
 69                    # output the matched molecule
 70                    if not writer.write(mol):   
 71                        sys.exit('Error: output of molecule failed')
 72
 73                elif not args.quiet: 
 74                    print(' -> substructure not found')
 75                        
 76            except Exception as e:
 77                sys.exit('Error: substructure search or output of molecule %s failed:\n%s' % (mol_id, str(e)))
 78
 79            i += 1
 80                
 81    except Exception as e: # handle exception raised in case of severe read errors
 82        sys.exit('Error: reading molecule failed:\n' + str(e))
 83
 84    writer.close()
 85    sys.exit(0)
 86        
 87def parseArgs() -> argparse.Namespace:
 88    parser = argparse.ArgumentParser(description='Writes input molecules that match the specified SMARTS substructure pattern to an output file.')
 89
 90    parser.add_argument('-i',
 91                        dest='in_file',
 92                        required=True,
 93                        metavar='<file>',
 94                        help='Molecule input file')
 95    parser.add_argument('-o',
 96                        dest='out_file',
 97                        required=True,
 98                        metavar='<file>',
 99                        help='Molecule output file')
100    parser.add_argument('-p',
101                        dest='smarts_ptn',
102                        required=True,
103                        metavar='<SMARTS>',
104                        help='SMARTS pattern describing the substructure to search for')
105    parser.add_argument('-m',
106                        dest='mark_matched_atoms',
107                        required=False,
108                        action='store_true',
109                        default=False,
110                        help='Set atom mapping ids of output molecule atoms to the ids of the matching SMARTS pattern atoms (default: false)')
111    parser.add_argument('-q',
112                        dest='quiet',
113                        required=False,
114                        action='store_true',
115                        default=False,
116                        help='Disable progress output (default: false)')
117    
118    return parser.parse_args()
119
120if __name__ == '__main__':
121    filterMolecules()

Download source file