1.1.4.1. Testing for the Presence of Substructures
The script substruct_filter.py reads molecules from an input file and writes all molecules that match the specified SMARTS substructure pattern to the output file.
Synopsis
python substruct_filter.py [-h] -i <file> -o <file> -p <SMARTS> [-q] [-m]
Mandatory options
- -i <file>
Molecule input file
- -o <file>
Molecule output file
- -p <SMARTS>
SMARTS pattern describing the substructure to search for
Other options
- -h, --help
Show help message and exit
- -m
Set atom mapping ids of output molecule atoms to the ids of the matching SMARTS pattern atoms (default: false)
- -q
Disable progress output (default: false)
Code
1import sys
2import argparse
3
4import CDPL.Chem as Chem
5
6
7def filterMolecules() -> None:
8 args = parseArgs()
9
10 # create reader for input molecules (format specified by file extension)
11 reader = Chem.MoleculeReader(args.in_file)
12
13 # create writer for the output of matching molecules (format specified by file extension)
14 writer = Chem.MolecularGraphWriter(args.out_file)
15
16 # parse the substructure SMARTS pattern
17 try:
18 sub_srch_ptn = Chem.parseSMARTS(args.smarts_ptn)
19 except Exception as e:
20 sys.exit('Error: parsing of SMARTS pattern failed:\n%s' % str(e))
21
22 # create and initialize an instance of the class Chem.SubstructureSearch that
23 # implements the substructure searching algorithm
24 substr_srch = Chem.SubstructureSearch(sub_srch_ptn)
25
26 # special settings for option -m
27 if args.mark_matched_atoms:
28 # store only the first found substructure mapping
29 substr_srch.setMaxNumMappings(1)
30 # in case of SMILES output: write atom-atom mapping ids, if available
31 Chem.setSMILESMolOutputAtomMappingIDParameter(writer, True)
32
33 # create an instance of the default implementation of the Chem.Molecule interface
34 mol = Chem.BasicMolecule()
35 i = 1
36
37 # read and process molecules one after the other until the end of input has been reached
38 try:
39 while reader.read(mol):
40 # compose a simple molecule identifier
41 mol_id = Chem.getName(mol).strip()
42
43 if mol_id == '':
44 mol_id = '#' + str(i) # fallback if name is empty
45 else:
46 mol_id = '\'%s\' (#%s)' % (mol_id, str(i))
47
48 if not args.quiet:
49 print('- Searching for a matching substructure in molecule %s...' % mol_id)
50
51 try:
52 Chem.initSubstructureSearchTarget(mol, False)
53
54 if args.mark_matched_atoms:
55 found_match = substr_srch.findMappings(mol) # for option -m atom-atom mapping data are required
56 else:
57 found_match = substr_srch.mappingExists(mol) # if option -m is false, information that a mapping exists is sufficient
58
59 if found_match:
60 if not args.quiet:
61 print(' -> substructure found, forwarding molecule to output file')
62
63 # processing for option -m
64 if args.mark_matched_atoms:
65 for ap in substr_srch.getMapping(0).atomMapping.items(): # for each [ptn. atom, mol. atom] pair of the found substructure match
66 if Chem.hasAtomMappingID(ap[0]): # if the pattern atom has an atom-atom mapping id
67 Chem.setAtomMappingID(ap[1], Chem.getAtomMappingID(ap[0])) # set the id of the matched molecule atom to the same value
68
69 # output the matched molecule
70 if not writer.write(mol):
71 sys.exit('Error: output of molecule failed')
72
73 elif not args.quiet:
74 print(' -> substructure not found')
75
76 except Exception as e:
77 sys.exit('Error: substructure search or output of molecule %s failed:\n%s' % (mol_id, str(e)))
78
79 i += 1
80
81 except Exception as e: # handle exception raised in case of severe read errors
82 sys.exit('Error: reading molecule failed:\n' + str(e))
83
84 writer.close()
85 sys.exit(0)
86
87def parseArgs() -> argparse.Namespace:
88 parser = argparse.ArgumentParser(description='Writes input molecules that match the specified SMARTS substructure pattern to an output file.')
89
90 parser.add_argument('-i',
91 dest='in_file',
92 required=True,
93 metavar='<file>',
94 help='Molecule input file')
95 parser.add_argument('-o',
96 dest='out_file',
97 required=True,
98 metavar='<file>',
99 help='Molecule output file')
100 parser.add_argument('-p',
101 dest='smarts_ptn',
102 required=True,
103 metavar='<SMARTS>',
104 help='SMARTS pattern describing the substructure to search for')
105 parser.add_argument('-m',
106 dest='mark_matched_atoms',
107 required=False,
108 action='store_true',
109 default=False,
110 help='Set atom mapping ids of output molecule atoms to the ids of the matching SMARTS pattern atoms (default: false)')
111 parser.add_argument('-q',
112 dest='quiet',
113 required=False,
114 action='store_true',
115 default=False,
116 help='Disable progress output (default: false)')
117
118 return parser.parse_args()
119
120if __name__ == '__main__':
121 filterMolecules()