Skip to content

taps.apps.moldesign.chemfunctions

MorganFingerprintTransformer

MorganFingerprintTransformer(
    length: int = 256, radius: int = 4
)

Bases: BaseEstimator, TransformerMixin

Class that converts SMILES strings to fingerprint vectors.

Source code in taps/apps/moldesign/chemfunctions.py
def __init__(self, length: int = 256, radius: int = 4) -> None:
    """Store the fingerprint settings on the instance.

    Args:
        length: Saved as `self.length`; `transform()` forwards it as the
            fingerprint bit-length.
        radius: Saved as `self.radius`; `transform()` forwards it as the
            fingerprint radius.
    """
    self.length, self.radius = length, radius

fit()

fit(X: Any, y: Any = None) -> Self

Fit the transformer.

Source code in taps/apps/moldesign/chemfunctions.py
def fit(self, X: Any, y: Any = None) -> Self:  # noqa: N803
    """Fit the transformer.

    Nothing is learned from the data: both arguments are ignored and the
    instance is returned unchanged.

    Args:
        X: Ignored.
        y: Ignored.

    Returns:
        This transformer instance.
    """
    return self  # No fitting is needed, so there is nothing to do

transform()

transform(X: Any, y: Any = None) -> Any

Compute the fingerprints.

Parameters:

  • X (Any) –

    List of SMILES strings.

  • y (Any, default: None ) –

    Ignored.

Returns:

  • Any

    Array of fingerprints.

Source code in taps/apps/moldesign/chemfunctions.py
def transform(self, X: Any, y: Any = None) -> Any:  # noqa: N803
    """Compute the fingerprints.

    Each entry of `X` is passed to `compute_morgan_fingerprints` through a
    process pool, and the per-molecule results are stacked row-wise.

    Args:
        X: List of SMILES strings.
        y: Ignored.

    Returns:
        Array of fingerprints, one row per entry of `X`.
    """
    # Bind this transformer's settings so the pool only has to supply
    # the SMILES string for each call.
    fingerprint_one = partial(
        compute_morgan_fingerprints,
        fingerprint_radius=self.radius,
        fingerprint_length=self.length,
    )

    # NOTE(review): `n_workers` is not defined in this method; presumably
    # it is a module-level setting -- confirm it exists.
    with ProcessPoolExecutor(max_workers=n_workers) as pool:
        rows = [*pool.map(fingerprint_one, X, chunksize=2048)]

    return numpy.vstack(rows)

generate_initial_xyz()

generate_initial_xyz(mol_string: str) -> str

Generate the XYZ coordinates for a molecule.

Parameters:

  • mol_string (str) –

    SMILES string.

Returns:

  • str

    XYZ coordinates for the molecule.

Source code in taps/apps/moldesign/chemfunctions.py
def generate_initial_xyz(mol_string: str) -> str:
    """Generate the XYZ coordinates for a molecule.

    The SMILES string is parsed, explicit hydrogens are added, a 3D
    conformer is embedded with a fixed random seed and then optimized with
    MMFF. The result is serialized as XYZ text: an atom-count line, a
    comment line holding the SMILES string, then one
    `symbol x y z` line per atom.

    Args:
        mol_string: SMILES string.

    Returns:
        XYZ coordinates for the molecule.

    Raises:
        ValueError: If the SMILES string cannot be parsed or if no 3D
            conformer could be embedded for it.
    """
    # Generate 3D coordinates for the molecule
    mol = Chem.MolFromSmiles(mol_string)
    if mol is None:
        raise ValueError(f'Parse failure for {mol_string}')
    mol = Chem.AddHs(mol)

    # EmbedMolecule signals failure by returning -1 rather than raising.
    # Check it here so the caller gets a clear message instead of a
    # conformer lookup error from a later call.
    if AllChem.EmbedMolecule(mol, randomSeed=1) < 0:
        raise ValueError(f'Embedding failure for {mol_string}')
    AllChem.MMFFOptimizeMolecule(mol)

    # Save geometry as 3D coordinates. Lines are collected and joined once
    # instead of growing a string with repeated concatenation.
    conf = mol.GetConformer()
    lines = [str(mol.GetNumAtoms()), mol_string]
    for i, atom in enumerate(mol.GetAtoms()):
        pos = conf.GetAtomPosition(i)
        lines.append(f'{atom.GetSymbol()} {pos[0]} {pos[1]} {pos[2]}')

    # Every line, including the last, is newline-terminated.
    return '\n'.join(lines) + '\n'

compute_vertical()

compute_vertical(smiles: str) -> float

Run the ionization potential computation.

Parameters:

  • smiles (str) –

    SMILES string to evaluate.

Returns:

  • float

    Ionization energy in Ha.

Source code in taps/apps/moldesign/chemfunctions.py
def compute_vertical(smiles: str) -> float:
    """Run the ionization potential computation.

    The molecule's geometry is relaxed with the XTB calculator attached,
    its energy is recorded, and the energy is evaluated again at that same
    geometry after assigning initial charges that sum to +1. The difference
    of the two energies is returned.

    Args:
        smiles: SMILES string to evaluate.

    Returns:
        Ionization energy in Ha.
        NOTE(review): the unit is whatever `get_potential_energy()` reports
        for this calculator -- confirm that it is really Ha.
    """
    # Starting geometry, as XYZ text
    initial_xyz = generate_initial_xyz(smiles)

    # Calculator used for both energy evaluations
    calculator = XTB(accuracy=0.05)

    # Parse the XYZ text into an atoms object and attach the calculator
    atoms = read(StringIO(initial_xyz), format='xyz')
    atoms.calc = calculator

    # Relax the neutral geometry (bounded to 250 steps, no log output)
    optimizer = LBFGSLineSearch(atoms, logfile=None)
    optimizer.run(fmax=0.02, steps=250)
    energy_neutral = atoms.get_potential_energy()

    # Re-evaluate the relaxed geometry in charged form: a total charge of
    # +1 is spread evenly over all atoms.
    n_atoms = len(atoms)
    atoms.set_initial_charges(numpy.full(n_atoms, 1 / n_atoms))
    energy_charged = atoms.get_potential_energy()

    return energy_charged - energy_neutral

compute_morgan_fingerprints()

compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[bool]

Get Morgan Fingerprint of a specific SMILES string.

Adapted from: https://github.com/google-research/google-research/blob/dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

Parameters:

  • smiles (str) –

    The molecule as a SMILES string.

  • fingerprint_length (int) –

    Bit-length of fingerprint.

  • fingerprint_radius (int) –

    Radius used to compute fingerprint.

Returns:

  • NDArray[bool]

    One-dimensional array of length fingerprint_length holding the bits of the Morgan fingerprint.

Source code in taps/apps/moldesign/chemfunctions.py
def compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[numpy.bool_]:
    """Get Morgan Fingerprint of a specific SMILES string.

    Adapted from:
    https://github.com/google-research/google-research/blob/dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

    Args:
        smiles: The molecule as a SMILES string.
        fingerprint_length: Bit-length of fingerprint.
        fingerprint_radius: Radius used to compute fingerprint.

    Returns:
        One-dimensional boolean array of length `fingerprint_length` \
        holding the bits of the Morgan fingerprint.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    # Parse the molecule. RDKit returns None for an unparsable string, so
    # fail here with the same error style as generate_initial_xyz rather
    # than letting the fingerprint call reject a None argument.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f'Parse failure for {smiles}')

    # Compute the fingerprint
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        molecule,
        fingerprint_radius,
        nBits=fingerprint_length,
    )

    # One-element placeholder: ConvertToNumpyArray resizes the destination
    # array to the length of the bit vector before filling it.
    arr = numpy.zeros((1,), dtype=numpy.bool_)

    # ConvertToNumpyArray takes ~ 0.19 ms, while
    # numpy.asarray takes ~ 4.69 ms
    DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr

train_model()

train_model(
    smiles: list[str], properties: list[float]
) -> Pipeline

Train a machine learning model using Morgan Fingerprints.

Parameters:

  • smiles (list[str]) –

    SMILES strings for each molecule

  • properties (list[float]) –

    Property value for each molecule.

Returns:

  • Pipeline

    A trained model.

Source code in taps/apps/moldesign/chemfunctions.py
def train_model(smiles: list[str], properties: list[float]) -> Pipeline:
    """Train a machine learning model using Morgan Fingerprints.

    The model is a two-step pipeline: SMILES strings are converted to
    fingerprints with the default `MorganFingerprintTransformer`, then a
    distance-weighted 4-nearest-neighbor regressor using the Jaccard metric
    is fitted on them.

    Args:
        smiles: SMILES strings for each molecule.
        properties: Property value for each molecule.

    Returns:
        A trained model.
    """
    featurizer = MorganFingerprintTransformer()
    regressor = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        metric='jaccard',
        n_jobs=-1,
    )
    steps = [
        ('fingerprint', featurizer),
        ('knn', regressor),
    ]
    pipeline = Pipeline(steps)
    return pipeline.fit(smiles, properties)

run_model()

run_model(model: Any, smiles: list[str]) -> DataFrame

Run a model on a list of SMILES strings.

Parameters:

  • model (Any) –

    Trained model that takes SMILES strings as inputs.

  • smiles (list[str]) –

    List of molecules to evaluate.

Returns:

  • DataFrame

    A dataframe with the molecules and their predicted outputs.

Source code in taps/apps/moldesign/chemfunctions.py
def run_model(model: Any, smiles: list[str]) -> pandas.DataFrame:
    """Run a model on a list of SMILES strings.

    Args:
        model: Trained model whose `predict` method takes SMILES strings
            as inputs.
        smiles: List of molecules to evaluate.

    Returns:
        A dataframe with a `smiles` column holding the inputs and an `ie`
        column holding the corresponding predictions.
    """
    predictions = model.predict(smiles)
    columns = {'smiles': smiles, 'ie': predictions}
    return pandas.DataFrame(columns)