4.1.1. Extended Connectivity Fingerprints (ECFPs)

The script gen_ecfp.py generates the Extended Connectivity Fingerprint (ECFP) [3] of each molecule read from the specified input file and writes the fingerprints, one per line, to the specified output file.

Synopsis

python gen_ecfp.py [-h] -i <file> -o <file> [-n <integer>] [-r <integer>] [-y] [-c]

Mandatory options

-i <file>

Input molecule file

-o <file>

Fingerprint output file

Other options

-h, --help

Show help message and exit

-n <integer>

Fingerprint size in bits (default: 1024)

-r <integer>

Max. atom environment radius in number of bonds (default: 2)

-y

Do not ignore hydrogens (by default, the fingerprint is generated for the H-deplete molecular graph)

-c

Include atom chirality (by default, atom chirality is not considered)

Code

  1import sys
  2import argparse
  3
  4import CDPL.Chem as Chem
  5import CDPL.Descr as Descr
  6import CDPL.Util as Util
  7
  8
  9# generates the binary ECFP of the given molecule
 10def genECFP(mol: Chem.Molecule, num_bits: int, radius: int, inc_hs: bool, inc_config: bool) -> Util.BitSet:
 11    Chem.calcBasicProperties(mol, False)            # calculate basic molecular properties (if not yet done)
 12   
 13    ecfp_gen = Descr.CircularFingerprintGenerator() # create ECFP generator instance
 14
 15    if inc_config:
 16        ecfp_gen.includeChirality(True)                  # allow atom chirality to have an impact on the ECFP generation
 17        Chem.calcCIPPriorities(mol, False)               # calculate atom symmetry classes for chiral atom perception and set corresponding property for all atoms
 18        Chem.perceiveAtomStereoCenters(mol, False, True) # perceive chiral atoms and set corresponding property for all atoms
 19        Chem.calcAtomStereoDescriptors(mol, False)       # calculate atom stereo descriptors and set corresponding property for all atoms
 20
 21    if inc_hs:        
 22        ecfp_gen.includeHydrogens(True)                # include explicit hydrogens in the ECFP generation
 23        Chem.makeHydrogenComplete(mol)                 # make any implicit hydrogens explicit
 24         
 25    fp = Util.BitSet()                                 # create fingerprint bitset
 26    fp.resize(num_bits)                                # set desired fingerprint size
 27
 28    ecfp_gen.setNumIterations(radius)                  # set num. iterations (=atom. env. radius)
 29    ecfp_gen.generate(mol)                             # extract chracteristic structural features
 30    ecfp_gen.setFeatureBits(fp)                        # set bits associated with the extracted structural features
 31
 32    # if needed, fp could be converted into a numpy single precision float array as follows:
 33    # fp = numpy.array(fp, dtype=numpy.float32)
 34    
 35    return fp
 36    
 37def parseArgs() -> argparse.Namespace:
 38    parser = argparse.ArgumentParser(description='Generates extended connectivity fingerprints (ECFPs) for given input molecules.')
 39
 40    parser.add_argument('-i',
 41                        dest='in_file',
 42                        required=True,
 43                        metavar='<file>',
 44                        help='Input molecule file')
 45    parser.add_argument('-o',
 46                        dest='out_file',
 47                        required=True,
 48                        metavar='<file>',
 49                        help='Fingerprint output file')
 50    parser.add_argument('-n',
 51                        dest='num_bits',
 52                        required=False,
 53                        metavar='<integer>',
 54                        default=1024,
 55                        help='Fingerprint size in bits (default: 1024)',
 56                        type=int)
 57    parser.add_argument('-r',
 58                        dest='radius',
 59                        required=False,
 60                        metavar='<integer>',
 61                        default=2,
 62                        help='Max. atom environment radius in number of bonds (default: 2)',
 63                        type=int)
 64    parser.add_argument('-y',
 65                        dest='inc_hs',
 66                        required=False,
 67                        action='store_true',
 68                        default=False,
 69                        help='Do not ignore hydrogens (by default, the fingerprint is generated for the H-deplete molecular graph)')
 70    parser.add_argument('-c',
 71                        dest='inc_config',
 72                        required=False,
 73                        action='store_true',
 74                        default=False,
 75                        help='Include atom chirality (by default, atom chirality is not considered)')
 76
 77    return parser.parse_args()
 78    
 79def main() -> None:
 80    args = parseArgs()
 81
 82    # create reader for input molecules (format specified by file extension)
 83    reader = Chem.MoleculeReader(args.in_file) 
 84
 85    # open output file storing the generated fingerprints
 86    out_file = open(args.out_file, 'w')
 87    
 88    # create an instance of the default implementation of the Chem.Molecule interface
 89    mol = Chem.BasicMolecule()
 90
 91    # read and process molecules one after the other until the end of input has been reached
 92    try:
 93        while reader.read(mol):
 94            try:
 95                fp = genECFP(mol, args.num_bits, args.radius, args.inc_hs, args.inc_config)
 96
 97                out_file.write(str(fp))
 98                out_file.write('\n')
 99
100            except Exception as e:
101                sys.exit('Error: processing of molecule failed: ' + str(e))
102                
103    except Exception as e: # handle exception raised in case of severe read errors
104        sys.exit('Error: reading molecule failed: ' + str(e))
105
106    out_file.close()
107    sys.exit(0)
108        
# start the fingerprint generation only when this file is executed as a script
if __name__ == "__main__":
    main()

Download source file