4.1.1. Extended Connectivity Fingerprints (ECFPs)
The script gen_ecfp.py generates and outputs the Extended Connectivity Fingerprint (ECFP) [3] of molecules read from a specified input file.
Synopsis
python gen_ecfp.py [-h] -i <file> -o <file> [-n <integer>] [-r <integer>] [-y] [-c]
Mandatory options
- -i <file>
Input molecule file
- -o <file>
Fingerprint output file
Other options
- -h, --help
Show help message and exit
- -n <integer>
Fingerprint size in bits (default: 1024)
- -r <integer>
Max. atom environment radius in number of bonds (default: 2)
- -y
Do not ignore hydrogens (by default, the fingerprint is generated for the H-deplete molecular graph)
- -c
Include atom chirality (by default, atom chirality is not considered)
Code
1import sys
2import argparse
3
4import CDPL.Chem as Chem
5import CDPL.Descr as Descr
6import CDPL.Util as Util
7
8
def genECFP(mol: Chem.Molecule, num_bits: int, radius: int, inc_hs: bool, inc_config: bool) -> Util.BitSet:
    """Generate the binary ECFP of the given molecule.

    Note that ``mol`` is modified in place: basic properties are calculated
    on it and, depending on the flags, stereo properties are set and implicit
    hydrogens are made explicit.

    Args:
        mol: The molecule to generate the fingerprint for.
        num_bits: Size of the returned fingerprint in bits.
        radius: Max. atom environment radius in number of bonds
            (passed on as the generator's number of iterations).
        inc_hs: If True, hydrogens are included in the ECFP generation.
        inc_config: If True, atom chirality has an impact on the ECFP
            generation.

    Returns:
        A ``Util.BitSet`` of size ``num_bits`` with the bits of the extracted
        structural features set.
    """
    # calculate basic molecular properties (if not yet done)
    Chem.calcBasicProperties(mol, False)

    # create the ECFP generator and set the num. of iterations (= atom env. radius)
    generator = Descr.CircularFingerprintGenerator()
    generator.setNumIterations(radius)

    if inc_config:
        # allow atom chirality to have an impact on the ECFP generation
        generator.includeChirality(True)

        # calculate atom symmetry classes for chiral atom perception
        Chem.calcCIPPriorities(mol, False)
        # perceive chiral atoms and set the corresponding property for all atoms
        Chem.perceiveAtomStereoCenters(mol, False, True)
        # calculate atom stereo descriptors and set the corresponding property for all atoms
        Chem.calcAtomStereoDescriptors(mol, False)

    if inc_hs:
        # include explicit hydrogens in the ECFP generation
        generator.includeHydrogens(True)
        # make any implicit hydrogens explicit
        Chem.makeHydrogenComplete(mol)

    # create the fingerprint bitset and give it the desired size
    bits = Util.BitSet()
    bits.resize(num_bits)

    # extract the characteristic structural features ...
    generator.generate(mol)
    # ... and set the bits associated with them
    generator.setFeatureBits(bits)

    # if needed, the bitset could be converted into a numpy single precision
    # float array as follows:
    # bits = numpy.array(bits, dtype=numpy.float32)

    return bits
36
def _boundedInt(minimum: int):
    """Return an argparse ``type`` callable accepting only integers >= ``minimum``."""

    def convert(text: str) -> int:
        try:
            value = int(text)
        except ValueError:
            # keep the wording argparse itself uses for type=int failures
            raise argparse.ArgumentTypeError('invalid int value: %r' % text)

        if value < minimum:
            raise argparse.ArgumentTypeError('value must be >= %d (got %d)' % (minimum, value))

        return value

    return convert


def parseArgs() -> argparse.Namespace:
    """Parse the command line arguments of the script.

    Compared to a plain ``type=int``, the numeric options are range-checked
    while parsing so that nonsensical values (a fingerprint with less than
    one bit, a negative radius) are rejected with a regular usage error
    instead of surfacing later during fingerprint generation.

    Returns:
        The parsed arguments with the attributes ``in_file``, ``out_file``,
        ``num_bits``, ``radius``, ``inc_hs`` and ``inc_config``.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments
            (and for ``-h``).
    """
    parser = argparse.ArgumentParser(description='Generates extended connectivity fingerprints (ECFPs) for given input molecules.')

    parser.add_argument('-i',
                        dest='in_file',
                        required=True,
                        metavar='<file>',
                        help='Input molecule file')
    parser.add_argument('-o',
                        dest='out_file',
                        required=True,
                        metavar='<file>',
                        help='Fingerprint output file')
    parser.add_argument('-n',
                        dest='num_bits',
                        required=False,
                        metavar='<integer>',
                        default=1024,
                        help='Fingerprint size in bits (default: 1024)',
                        type=_boundedInt(1))  # a fingerprint needs at least one bit
    parser.add_argument('-r',
                        dest='radius',
                        required=False,
                        metavar='<integer>',
                        default=2,
                        help='Max. atom environment radius in number of bonds (default: 2)',
                        type=_boundedInt(0))  # a radius of zero bonds is the smallest meaningful value
    parser.add_argument('-y',
                        dest='inc_hs',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Do not ignore hydrogens (by default, the fingerprint is generated for the H-deplete molecular graph)')
    parser.add_argument('-c',
                        dest='inc_config',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Include atom chirality (by default, atom chirality is not considered)')

    return parser.parse_args()
78
def main() -> None:
    """Read molecules from the input file and write one ECFP per line to the output file.

    The output file is managed by a ``with`` block so that it is flushed and
    closed on every exit path, including the ``sys.exit()`` calls used for
    error reporting (``SystemExit`` propagates through the ``with`` block).

    Raises:
        SystemExit: Always. Exit status 0 on success; on a processing or read
            error the error message is passed to ``sys.exit()``.
    """
    args = parseArgs()

    # create reader for input molecules (format specified by file extension)
    reader = Chem.MoleculeReader(args.in_file)

    # open output file storing the generated fingerprints
    with open(args.out_file, 'w') as out_file:
        # create an instance of the default implementation of the Chem.Molecule interface
        mol = Chem.BasicMolecule()

        # read and process molecules one after the other until the end of input has been reached
        try:
            while reader.read(mol):
                try:
                    fp = genECFP(mol, args.num_bits, args.radius, args.inc_hs, args.inc_config)

                    out_file.write(str(fp))
                    out_file.write('\n')

                except Exception as e:
                    sys.exit('Error: processing of molecule failed: ' + str(e))

        except Exception as e:  # handle exception raised in case of severe read errors
            sys.exit('Error: reading molecule failed: ' + str(e))

    sys.exit(0)
108
# run the fingerprint generation only when executed as a script (not when imported)
if __name__ == '__main__':
    main()